github.com/rohankumardubey/nomad@v0.11.8/scheduler/generic_sched.go

github.com/rohankumardubey/nomad@v0.11.8/scheduler/generic_sched.go (about)

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	log "github.com/hashicorp/go-hclog"
     8  	"github.com/hashicorp/go-memdb"
     9  	"github.com/hashicorp/go-multierror"
    10  	"github.com/hashicorp/nomad/helper/uuid"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  )
    13  
const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocLost is the status used when an allocation is lost
	allocLost = "alloc is lost since its node is down"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "alloc not needed as node is tainted"

	// allocRescheduled is the status used when an allocation failed and was rescheduled
	allocRescheduled = "alloc was rescheduled because it failed"

	// blockedEvalMaxPlanDesc is the description used for blocked evals that are
	// a result of hitting the max number of plan attempts
	blockedEvalMaxPlanDesc = "created due to placement conflicts"

	// blockedEvalFailedPlacements is the description used for blocked evals
	// that are a result of failing to place all allocations.
	blockedEvalFailedPlacements = "created to place remaining allocations"

	// reschedulingFollowupEvalDesc is the description used when creating follow
	// up evals for delayed rescheduling
	reschedulingFollowupEvalDesc = "created for delayed rescheduling"

	// maxPastRescheduleEvents is the maximum number of past reschedule events
	// that we track when unlimited rescheduling is enabled
	maxPastRescheduleEvents = 5
)
    61  
    62  // SetStatusError is used to set the status of the evaluation to the given error
    63  type SetStatusError struct {
    64  	Err        error
    65  	EvalStatus string
    66  }
    67  
    68  func (s *SetStatusError) Error() string {
    69  	return s.Err.Error()
    70  }
    71  
// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
type GenericScheduler struct {
	// logger is named per scheduler type by the constructors and is tagged
	// with the evaluation's identifiers at the start of Process.
	logger log.Logger
	// state is the view used for job, deployment, alloc and node lookups. It
	// is replaced when a plan submission hands back a refreshed state.
	state State
	// planner receives submitted plans, newly created evals and reblocked evals.
	planner Planner
	// batch is true for the batch scheduler; it selects the batch retry limit
	// and is passed on to the placement stack and the reconciler.
	batch bool

	// eval is the evaluation currently being processed.
	eval *structs.Evaluation
	// job is looked up by the eval's namespace and job ID on every attempt.
	job *structs.Job
	// plan is created from eval and job on every attempt.
	plan *structs.Plan
	// planResult is the result of the most recent plan submission.
	planResult *structs.PlanResult
	// ctx is the evaluation context, rebuilt on every attempt.
	ctx *EvalContext
	// stack is the placement stack, rebuilt on every attempt.
	stack *GenericStack

	// followUpEvals are evals with WaitUntil set, which are delayed until that time
	// before being rescheduled
	followUpEvals []*structs.Evaluation

	// deployment is the latest deployment of the job (only looked up when not
	// in batch mode), replaced by the reconciler's deployment when it
	// produces one.
	deployment *structs.Deployment

	// blocked is the blocked evaluation created by createBlockedEval, if any.
	blocked *structs.Evaluation
	// failedTGAllocs holds, per task group name, the metrics of the first
	// placement that failed for that group during the current attempt.
	failedTGAllocs map[string]*structs.AllocMetric
	// queuedAllocs counts, per task group name, the allocations that still
	// need to be placed.
	queuedAllocs map[string]int
}
   100  
   101  // NewServiceScheduler is a factory function to instantiate a new service scheduler
   102  func NewServiceScheduler(logger log.Logger, state State, planner Planner) Scheduler {
   103  	s := &GenericScheduler{
   104  		logger:  logger.Named("service_sched"),
   105  		state:   state,
   106  		planner: planner,
   107  		batch:   false,
   108  	}
   109  	return s
   110  }
   111  
   112  // NewBatchScheduler is a factory function to instantiate a new batch scheduler
   113  func NewBatchScheduler(logger log.Logger, state State, planner Planner) Scheduler {
   114  	s := &GenericScheduler{
   115  		logger:  logger.Named("batch_sched"),
   116  		state:   state,
   117  		planner: planner,
   118  		batch:   true,
   119  	}
   120  	return s
   121  }
   122  
   123  // Process is used to handle a single evaluation
   124  func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
   125  	// Store the evaluation
   126  	s.eval = eval
   127  
   128  	// Update our logger with the eval's information
   129  	s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace)
   130  
   131  	// Verify the evaluation trigger reason is understood
   132  	switch eval.TriggeredBy {
   133  	case structs.EvalTriggerJobRegister, structs.EvalTriggerJobDeregister,
   134  		structs.EvalTriggerNodeDrain, structs.EvalTriggerNodeUpdate,
   135  		structs.EvalTriggerAllocStop,
   136  		structs.EvalTriggerRollingUpdate, structs.EvalTriggerQueuedAllocs,
   137  		structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans,
   138  		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc,
   139  		structs.EvalTriggerFailedFollowUp, structs.EvalTriggerPreemption,
   140  		structs.EvalTriggerScaling:
   141  	default:
   142  		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
   143  			eval.TriggeredBy)
   144  		return setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
   145  			s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs,
   146  			s.deployment.GetID())
   147  	}
   148  
   149  	// Retry up to the maxScheduleAttempts and reset if progress is made.
   150  	progress := func() bool { return progressMade(s.planResult) }
   151  	limit := maxServiceScheduleAttempts
   152  	if s.batch {
   153  		limit = maxBatchScheduleAttempts
   154  	}
   155  	if err := retryMax(limit, s.process, progress); err != nil {
   156  		if statusErr, ok := err.(*SetStatusError); ok {
   157  			// Scheduling was tried but made no forward progress so create a
   158  			// blocked eval to retry once resources become available.
   159  			var mErr multierror.Error
   160  			if err := s.createBlockedEval(true); err != nil {
   161  				mErr.Errors = append(mErr.Errors, err)
   162  			}
   163  			if err := setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
   164  				s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
   165  				s.queuedAllocs, s.deployment.GetID()); err != nil {
   166  				mErr.Errors = append(mErr.Errors, err)
   167  			}
   168  			return mErr.ErrorOrNil()
   169  		}
   170  		return err
   171  	}
   172  
   173  	// If the current evaluation is a blocked evaluation and we didn't place
   174  	// everything, do not update the status to complete.
   175  	if s.eval.Status == structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 {
   176  		e := s.ctx.Eligibility()
   177  		newEval := s.eval.Copy()
   178  		newEval.EscapedComputedClass = e.HasEscaped()
   179  		newEval.ClassEligibility = e.GetClasses()
   180  		newEval.QuotaLimitReached = e.QuotaLimitReached()
   181  		return s.planner.ReblockEval(newEval)
   182  	}
   183  
   184  	// Update the status to complete
   185  	return setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
   186  		s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs,
   187  		s.deployment.GetID())
   188  }
   189  
   190  // createBlockedEval creates a blocked eval and submits it to the planner. If
   191  // failure is set to true, the eval's trigger reason reflects that.
   192  func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
   193  	e := s.ctx.Eligibility()
   194  	escaped := e.HasEscaped()
   195  
   196  	// Only store the eligible classes if the eval hasn't escaped.
   197  	var classEligibility map[string]bool
   198  	if !escaped {
   199  		classEligibility = e.GetClasses()
   200  	}
   201  
   202  	s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached())
   203  	if planFailure {
   204  		s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
   205  		s.blocked.StatusDescription = blockedEvalMaxPlanDesc
   206  	} else {
   207  		s.blocked.StatusDescription = blockedEvalFailedPlacements
   208  	}
   209  
   210  	return s.planner.CreateEval(s.blocked)
   211  }
   212  
   213  // process is wrapped in retryMax to iteratively run the handler until we have no
   214  // further work or we've made the maximum number of attempts.
   215  func (s *GenericScheduler) process() (bool, error) {
   216  	// Lookup the Job by ID
   217  	var err error
   218  	ws := memdb.NewWatchSet()
   219  	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
   220  	if err != nil {
   221  		return false, fmt.Errorf("failed to get job %q: %v", s.eval.JobID, err)
   222  	}
   223  
   224  	numTaskGroups := 0
   225  	stopped := s.job.Stopped()
   226  	if !stopped {
   227  		numTaskGroups = len(s.job.TaskGroups)
   228  	}
   229  	s.queuedAllocs = make(map[string]int, numTaskGroups)
   230  	s.followUpEvals = nil
   231  
   232  	// Create a plan
   233  	s.plan = s.eval.MakePlan(s.job)
   234  
   235  	if !s.batch {
   236  		// Get any existing deployment
   237  		s.deployment, err = s.state.LatestDeploymentByJobID(ws, s.eval.Namespace, s.eval.JobID)
   238  		if err != nil {
   239  			return false, fmt.Errorf("failed to get job deployment %q: %v", s.eval.JobID, err)
   240  		}
   241  	}
   242  
   243  	// Reset the failed allocations
   244  	s.failedTGAllocs = nil
   245  
   246  	// Create an evaluation context
   247  	s.ctx = NewEvalContext(s.state, s.plan, s.logger)
   248  
   249  	// Construct the placement stack
   250  	s.stack = NewGenericStack(s.batch, s.ctx)
   251  	if !s.job.Stopped() {
   252  		s.stack.SetJob(s.job)
   253  	}
   254  
   255  	// Compute the target job allocations
   256  	if err := s.computeJobAllocs(); err != nil {
   257  		s.logger.Error("failed to compute job allocations", "error", err)
   258  		return false, err
   259  	}
   260  
   261  	// If there are failed allocations, we need to create a blocked evaluation
   262  	// to place the failed allocations when resources become available. If the
   263  	// current evaluation is already a blocked eval, we reuse it. If not, submit
   264  	// a new eval to the planner in createBlockedEval. If rescheduling should
   265  	// be delayed, do that instead.
   266  	delayInstead := len(s.followUpEvals) > 0 && s.eval.WaitUntil.IsZero()
   267  
   268  	if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil &&
   269  		!delayInstead {
   270  		if err := s.createBlockedEval(false); err != nil {
   271  			s.logger.Error("failed to make blocked eval", "error", err)
   272  			return false, err
   273  		}
   274  		s.logger.Debug("failed to place all allocations, blocked eval created", "blocked_eval_id", s.blocked.ID)
   275  	}
   276  
   277  	// If the plan is a no-op, we can bail. If AnnotatePlan is set submit the plan
   278  	// anyways to get the annotations.
   279  	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
   280  		return true, nil
   281  	}
   282  
   283  	// Create follow up evals for any delayed reschedule eligible allocations, except in
   284  	// the case that this evaluation was already delayed.
   285  	if delayInstead {
   286  		for _, eval := range s.followUpEvals {
   287  			eval.PreviousEval = s.eval.ID
   288  			// TODO(preetha) this should be batching evals before inserting them
   289  			if err := s.planner.CreateEval(eval); err != nil {
   290  				s.logger.Error("failed to make next eval for rescheduling", "error", err)
   291  				return false, err
   292  			}
   293  			s.logger.Debug("found reschedulable allocs, followup eval created", "followup_eval_id", eval.ID)
   294  		}
   295  	}
   296  
   297  	// Submit the plan and store the results.
   298  	result, newState, err := s.planner.SubmitPlan(s.plan)
   299  	s.planResult = result
   300  	if err != nil {
   301  		return false, err
   302  	}
   303  
   304  	// Decrement the number of allocations pending per task group based on the
   305  	// number of allocations successfully placed
   306  	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)
   307  
   308  	// If we got a state refresh, try again since we have stale data
   309  	if newState != nil {
   310  		s.logger.Debug("refresh forced")
   311  		s.state = newState
   312  		return false, nil
   313  	}
   314  
   315  	// Try again if the plan was not fully committed, potential conflict
   316  	fullCommit, expected, actual := result.FullCommit(s.plan)
   317  	if !fullCommit {
   318  		s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual)
   319  		if newState == nil {
   320  			return false, fmt.Errorf("missing state refresh after partial commit")
   321  		}
   322  		return false, nil
   323  	}
   324  
   325  	// Success!
   326  	return true, nil
   327  }
   328  
   329  // computeJobAllocs is used to reconcile differences between the job,
   330  // existing allocations and node status to update the allocations.
   331  func (s *GenericScheduler) computeJobAllocs() error {
   332  	// Lookup the allocations by JobID
   333  	ws := memdb.NewWatchSet()
   334  	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
   335  	if err != nil {
   336  		return fmt.Errorf("failed to get allocs for job '%s': %v",
   337  			s.eval.JobID, err)
   338  	}
   339  
   340  	// Determine the tainted nodes containing job allocs
   341  	tainted, err := taintedNodes(s.state, allocs)
   342  	if err != nil {
   343  		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
   344  			s.eval.JobID, err)
   345  	}
   346  
   347  	// Update the allocations which are in pending/running state on tainted
   348  	// nodes to lost, but only if the scheduler has already marked them
   349  	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)
   350  
   351  	reconciler := NewAllocReconciler(s.logger,
   352  		genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID),
   353  		s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted, s.eval.ID)
   354  	results := reconciler.Compute()
   355  	s.logger.Debug("reconciled current state with desired state", "results", log.Fmt("%#v", results))
   356  
   357  	if s.eval.AnnotatePlan {
   358  		s.plan.Annotations = &structs.PlanAnnotations{
   359  			DesiredTGUpdates: results.desiredTGUpdates,
   360  		}
   361  	}
   362  
   363  	// Add the deployment changes to the plan
   364  	s.plan.Deployment = results.deployment
   365  	s.plan.DeploymentUpdates = results.deploymentUpdates
   366  
   367  	// Store all the follow up evaluations from rescheduled allocations
   368  	if len(results.desiredFollowupEvals) > 0 {
   369  		for _, evals := range results.desiredFollowupEvals {
   370  			s.followUpEvals = append(s.followUpEvals, evals...)
   371  		}
   372  	}
   373  
   374  	// Update the stored deployment
   375  	if results.deployment != nil {
   376  		s.deployment = results.deployment
   377  	}
   378  
   379  	// Handle the stop
   380  	for _, stop := range results.stop {
   381  		s.plan.AppendStoppedAlloc(stop.alloc, stop.statusDescription, stop.clientStatus, stop.followupEvalID)
   382  	}
   383  
   384  	// Handle the in-place updates
   385  	for _, update := range results.inplaceUpdate {
   386  		if update.DeploymentID != s.deployment.GetID() {
   387  			update.DeploymentID = s.deployment.GetID()
   388  			update.DeploymentStatus = nil
   389  		}
   390  		s.ctx.Plan().AppendAlloc(update)
   391  	}
   392  
   393  	// Handle the annotation updates
   394  	for _, update := range results.attributeUpdates {
   395  		s.ctx.Plan().AppendAlloc(update)
   396  	}
   397  
   398  	// Nothing remaining to do if placement is not required
   399  	if len(results.place)+len(results.destructiveUpdate) == 0 {
   400  		// If the job has been purged we don't have access to the job. Otherwise
   401  		// set the queued allocs to zero. This is true if the job is being
   402  		// stopped as well.
   403  		if s.job != nil {
   404  			for _, tg := range s.job.TaskGroups {
   405  				s.queuedAllocs[tg.Name] = 0
   406  			}
   407  		}
   408  		return nil
   409  	}
   410  
   411  	// Record the number of allocations that needs to be placed per Task Group
   412  	for _, place := range results.place {
   413  		s.queuedAllocs[place.taskGroup.Name] += 1
   414  	}
   415  	for _, destructive := range results.destructiveUpdate {
   416  		s.queuedAllocs[destructive.placeTaskGroup.Name] += 1
   417  	}
   418  
   419  	// Compute the placements
   420  	place := make([]placementResult, 0, len(results.place))
   421  	for _, p := range results.place {
   422  		place = append(place, p)
   423  	}
   424  
   425  	destructive := make([]placementResult, 0, len(results.destructiveUpdate))
   426  	for _, p := range results.destructiveUpdate {
   427  		destructive = append(destructive, p)
   428  	}
   429  	return s.computePlacements(destructive, place)
   430  }
   431  
// computePlacements computes placements for allocations. It is given the set of
// destructive updates to place and the set of new placements to place.
//
// For every requested placement it asks the stack for a node. On success a
// new pending allocation is appended to the plan; on failure the metrics are
// recorded in failedTGAllocs under the task group's name, and later
// placements for that task group are only counted as coalesced failures.
// An error is returned only when the ready nodes or a preferred node cannot
// be looked up.
func (s *GenericScheduler) computePlacements(destructive, place []placementResult) error {
	// Get the base nodes
	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	// New allocations are only tied to the deployment while it is active.
	var deploymentID string
	if s.deployment != nil && s.deployment.Active() {
		deploymentID = s.deployment.ID
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	// Capture current time to use as the start time for any rescheduled allocations
	now := time.Now()

	// Have to handle destructive changes first as we need to discount their
	// resources. To understand this imagine the resources were reduced and the
	// count was scaled up.
	for _, results := range [][]placementResult{destructive, place} {
		for _, missing := range results {
			// Get the task group
			tg := missing.TaskGroup()

			// Check if this task group has already failed; if so only count
			// the failure instead of attempting another placement.
			if metric, ok := s.failedTGAllocs[tg.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}

			// Find the preferred node
			preferredNode, err := s.findPreferredNode(missing)
			if err != nil {
				return err
			}

			// Check if we should stop the previous allocation upon successful
			// placement of its replacement. This allows atomic placements/stops. We
			// stop the allocation before trying to find a replacement because this
			// frees the resources currently used by the previous allocation.
			stopPrevAlloc, stopPrevAllocDesc := missing.StopPreviousAlloc()
			prevAllocation := missing.PreviousAllocation()
			if stopPrevAlloc {
				s.plan.AppendStoppedAlloc(prevAllocation, stopPrevAllocDesc, "", "")
			}

			// Compute penalty nodes for rescheduled allocs
			selectOptions := getSelectOptions(prevAllocation, preferredNode)
			option := s.selectNextOption(tg, selectOptions)

			// Store the available nodes by datacenter
			s.ctx.Metrics().NodesAvailable = byDC

			// Compute top K scoring node metadata
			s.ctx.Metrics().PopulateScoreMetaData()

			// Set fields based on if we found an allocation option
			if option != nil {
				resources := &structs.AllocatedResources{
					Tasks:          option.TaskResources,
					TaskLifecycles: option.TaskLifecycles,
					Shared: structs.AllocatedSharedResources{
						DiskMB: int64(tg.EphemeralDisk.SizeMB),
					},
				}
				if option.AllocResources != nil {
					resources.Shared.Networks = option.AllocResources.Networks
				}

				// Create an allocation for this
				alloc := &structs.Allocation{
					ID:                 uuid.Generate(),
					Namespace:          s.job.Namespace,
					EvalID:             s.eval.ID,
					Name:               missing.Name(),
					JobID:              s.job.ID,
					TaskGroup:          tg.Name,
					Metrics:            s.ctx.Metrics(),
					NodeID:             option.Node.ID,
					NodeName:           option.Node.Name,
					DeploymentID:       deploymentID,
					TaskResources:      resources.OldTaskResources(),
					AllocatedResources: resources,
					DesiredStatus:      structs.AllocDesiredStatusRun,
					ClientStatus:       structs.AllocClientStatusPending,
					// SharedResources is considered deprecated, will be removed in 0.11.
					// It is only set for compat reasons.
					SharedResources: &structs.Resources{
						DiskMB:   tg.EphemeralDisk.SizeMB,
						Networks: resources.Shared.Networks,
					},
				}

				// If the new allocation is replacing an older allocation then we
				// record the older allocation id so that they are chained
				if prevAllocation != nil {
					alloc.PreviousAllocation = prevAllocation.ID
					if missing.IsRescheduling() {
						updateRescheduleTracker(alloc, prevAllocation, now)
					}
				}

				// If we are placing a canary and we found a match, add the canary
				// to the deployment state object and mark it as a canary.
				if missing.Canary() && s.deployment != nil {
					alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
						Canary: true,
					}
				}

				s.handlePreemptions(option, alloc, missing)

				// Track the placement
				s.plan.AppendAlloc(alloc)

			} else {
				// Lazy initialize the failed map
				if s.failedTGAllocs == nil {
					s.failedTGAllocs = make(map[string]*structs.AllocMetric)
				}

				// Track the fact that we didn't find a placement
				s.failedTGAllocs[tg.Name] = s.ctx.Metrics()

				// If we weren't able to find a replacement for the allocation, back
				// out the fact that we asked to stop the allocation.
				if stopPrevAlloc {
					s.plan.PopUpdate(prevAllocation)
				}
			}

		}
	}

	return nil
}
   572  
   573  // getSelectOptions sets up preferred nodes and penalty nodes
   574  func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs.Node) *SelectOptions {
   575  	selectOptions := &SelectOptions{}
   576  	if prevAllocation != nil {
   577  		penaltyNodes := make(map[string]struct{})
   578  
   579  		// If alloc failed, penalize the node it failed on to encourage
   580  		// rescheduling on a new node.
   581  		if prevAllocation.ClientStatus == structs.AllocClientStatusFailed {
   582  			penaltyNodes[prevAllocation.NodeID] = struct{}{}
   583  		}
   584  		if prevAllocation.RescheduleTracker != nil {
   585  			for _, reschedEvent := range prevAllocation.RescheduleTracker.Events {
   586  				penaltyNodes[reschedEvent.PrevNodeID] = struct{}{}
   587  			}
   588  		}
   589  		selectOptions.PenaltyNodeIDs = penaltyNodes
   590  	}
   591  	if preferredNode != nil {
   592  		selectOptions.PreferredNodes = []*structs.Node{preferredNode}
   593  	}
   594  	return selectOptions
   595  }
   596  
   597  // updateRescheduleTracker carries over previous restart attempts and adds the most recent restart
   598  func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation, now time.Time) {
   599  	reschedPolicy := prev.ReschedulePolicy()
   600  	var rescheduleEvents []*structs.RescheduleEvent
   601  	if prev.RescheduleTracker != nil {
   602  		var interval time.Duration
   603  		if reschedPolicy != nil {
   604  			interval = reschedPolicy.Interval
   605  		}
   606  		// If attempts is set copy all events in the interval range
   607  		if reschedPolicy.Attempts > 0 {
   608  			for _, reschedEvent := range prev.RescheduleTracker.Events {
   609  				timeDiff := now.UnixNano() - reschedEvent.RescheduleTime
   610  				// Only copy over events that are within restart interval
   611  				// This keeps the list of events small in cases where there's a long chain of old restart events
   612  				if interval > 0 && timeDiff <= interval.Nanoseconds() {
   613  					rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
   614  				}
   615  			}
   616  		} else {
   617  			// Only copy the last n if unlimited is set
   618  			start := 0
   619  			if len(prev.RescheduleTracker.Events) > maxPastRescheduleEvents {
   620  				start = len(prev.RescheduleTracker.Events) - maxPastRescheduleEvents
   621  			}
   622  			for i := start; i < len(prev.RescheduleTracker.Events); i++ {
   623  				reschedEvent := prev.RescheduleTracker.Events[i]
   624  				rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
   625  			}
   626  		}
   627  	}
   628  	nextDelay := prev.NextDelay()
   629  	rescheduleEvent := structs.NewRescheduleEvent(now.UnixNano(), prev.ID, prev.NodeID, nextDelay)
   630  	rescheduleEvents = append(rescheduleEvents, rescheduleEvent)
   631  	alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents}
   632  }
   633  
   634  // findPreferredNode finds the preferred node for an allocation
   635  func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.Node, error) {
   636  	if prev := place.PreviousAllocation(); prev != nil && place.TaskGroup().EphemeralDisk.Sticky == true {
   637  		var preferredNode *structs.Node
   638  		ws := memdb.NewWatchSet()
   639  		preferredNode, err := s.state.NodeByID(ws, prev.NodeID)
   640  		if err != nil {
   641  			return nil, err
   642  		}
   643  
   644  		if preferredNode != nil && preferredNode.Ready() {
   645  			return preferredNode, nil
   646  		}
   647  	}
   648  	return nil, nil
   649  }