github.com/hernad/nomad@v1.6.112/nomad/worker.go

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package nomad
     5  
     6  import (
     7  	"context"
     8  	"encoding/json"
     9  	"errors"
    10  	"fmt"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	metrics "github.com/armon/go-metrics"
    16  	log "github.com/hashicorp/go-hclog"
    17  	memdb "github.com/hashicorp/go-memdb"
    18  	"github.com/hashicorp/go-version"
    19  	"github.com/hernad/nomad/helper/uuid"
    20  	"github.com/hernad/nomad/nomad/state"
    21  	"github.com/hernad/nomad/nomad/structs"
    22  	"github.com/hernad/nomad/scheduler"
    23  )
    24  
    25  const (
    26  	// backoffBaselineFast is the baseline time for exponential backoff
    27  	backoffBaselineFast = 20 * time.Millisecond
    28  
    29  	// backoffBaselineSlow is the baseline time for exponential backoff
    30  	// that is much slower than backoffBaselineFast
    31  	backoffBaselineSlow = 500 * time.Millisecond
    32  
    33  	// backoffLimitSlow is the limit of the exponential backoff for
    34  	// the slower backoff
    35  	backoffLimitSlow = 10 * time.Second
    36  
    37  	// backoffSchedulerVersionMismatch is the backoff between retries when the
    38  	// scheduler version mismatches that of the leader.
    39  	backoffSchedulerVersionMismatch = 30 * time.Second
    40  
    41  	// dequeueTimeout is used to time out an evaluation dequeue so that
    42  	// we can check if there is a shutdown event
    43  	dequeueTimeout = 500 * time.Millisecond
    44  
    45  	// raftSyncLimit is the limit of time we will wait for Raft replication
    46  	// to catch up to the evaluation. This is used to fast Nack and
    47  	// allow another scheduler to pick it up.
    48  	raftSyncLimit = 5 * time.Second
    49  
    50  	// dequeueErrGrace is the grace period where we don't log about
    51  	// dequeue errors after start. This is to improve the user experience
    52  	// in dev mode where the leader isn't elected for a few seconds.
    53  	dequeueErrGrace = 10 * time.Second
    54  )
    55  
    56  type WorkerStatus int
    57  
    58  //go:generate stringer -trimprefix=Worker -output worker_string_workerstatus.go -linecomment -type=WorkerStatus
    59  const (
    60  	WorkerUnknownStatus WorkerStatus = iota // Unknown
    61  	WorkerStarting
    62  	WorkerStarted
    63  	WorkerPausing
    64  	WorkerPaused
    65  	WorkerResuming
    66  	WorkerStopping
    67  	WorkerStopped
    68  )
    69  
    70  type SchedulerWorkerStatus int
    71  
    72  //go:generate stringer -trimprefix=Workload -output worker_string_schedulerworkerstatus.go -linecomment -type=SchedulerWorkerStatus
    73  const (
    74  	WorkloadUnknownStatus SchedulerWorkerStatus = iota
    75  	WorkloadRunning
    76  	WorkloadWaitingToDequeue
    77  	WorkloadWaitingForRaft
    78  	WorkloadScheduling
    79  	WorkloadSubmitting
    80  	WorkloadBackoff
    81  	WorkloadStopped
    82  	WorkloadPaused
    83  )
    84  
    85  // Worker is a single-threaded scheduling worker. There may be multiple
    86  // workers running per server (leader or follower). They are responsible for
    87  // dequeuing pending evaluations, invoking schedulers, submitting plans, and
    88  // managing the lifecycle around making task allocations. They bridge the
    89  // business logic of the scheduler with the plumbing required to make it all work.
    90  type Worker struct {
    91  	srv    *Server
    92  	logger log.Logger
    93  	start  time.Time
    94  	id     string
    95  
    96  	status         WorkerStatus
    97  	workloadStatus SchedulerWorkerStatus
    98  	statusLock     sync.RWMutex
    99  
   100  	pauseFlag bool
   101  	pauseLock sync.Mutex
   102  	pauseCond *sync.Cond
   103  	ctx       context.Context
   104  	cancelFn  context.CancelFunc
   105  
   106  	// the Server.Config.EnabledSchedulers value is not safe for concurrent access, so
   107  	// the worker needs a cached copy of it. Workers are stopped if this value changes.
   108  	enabledSchedulers []string
   109  
   110  	// failures is the count of errors encountered while dequeueing evaluations
   111  	// and is used to calculate backoff.
   112  	failures  uint
   113  	evalToken string
   114  
   115  	// snapshotIndex is the index of the snapshot in which the scheduler was
   116  	// first invoked. It is used to mark the SnapshotIndex of evaluations
   117  	// Created, Updated or Reblocked.
   118  	snapshotIndex uint64
   119  }
   120  
   121  // NewWorker starts a new scheduler worker associated with the given server
   122  func NewWorker(ctx context.Context, srv *Server, args SchedulerWorkerPoolArgs) (*Worker, error) {
   123  	w := newWorker(ctx, srv, args)
   124  	w.Start()
   125  	return w, nil
   126  }
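
        // Example (sketch): creating a worker tied to the server's lifetime and
        // stopping it later. This assumes a configured *Server srv and a context
        // ctx that is cancelled on server shutdown; the variable names are
        // illustrative rather than part of this package's API.
        //
        //	args := SchedulerWorkerPoolArgs{EnabledSchedulers: []string{structs.JobTypeService}}
        //	w, err := NewWorker(ctx, srv, args)
        //	if err != nil {
        //		return err
        //	}
        //	defer w.Stop()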
   127  
   128  // newWorker creates a worker without calling its Start func. This is useful for testing.
   129  func newWorker(ctx context.Context, srv *Server, args SchedulerWorkerPoolArgs) *Worker {
   130  	w := &Worker{
   131  		id:                uuid.Generate(),
   132  		srv:               srv,
   133  		start:             time.Now(),
   134  		status:            WorkerStarting,
   135  		enabledSchedulers: make([]string, len(args.EnabledSchedulers)),
   136  	}
   137  	copy(w.enabledSchedulers, args.EnabledSchedulers)
   138  
   139  	w.logger = srv.logger.ResetNamed("worker").With("worker_id", w.id)
   140  	w.pauseCond = sync.NewCond(&w.pauseLock)
   141  	w.ctx, w.cancelFn = context.WithCancel(ctx)
   142  
   143  	return w
   144  }
   145  
   146  // ID returns a string ID for the worker.
   147  func (w *Worker) ID() string {
   148  	return w.id
   149  }
   150  
   151  // Start transitions a worker to the starting state. Check
   152  // to see if it started using IsStarted()
   153  func (w *Worker) Start() {
   154  	w.setStatus(WorkerStarting)
   155  	go w.run(raftSyncLimit)
   156  }
   157  
   158  // Pause transitions a worker to the pausing state. Check
   159  // to see if it paused using IsPaused()
   160  func (w *Worker) Pause() {
   161  	if w.isPausable() {
   162  		w.setStatus(WorkerPausing)
   163  		w.setPauseFlag(true)
   164  	}
   165  }
   166  
   167  // Resume transitions a worker to the resuming state. Check
   168  // to see if the worker restarted by calling IsStarted()
   169  func (w *Worker) Resume() {
   170  	if w.IsPaused() {
   171  		w.setStatus(WorkerResuming)
   172  		w.setPauseFlag(false)
   173  		w.pauseCond.Broadcast()
   174  	}
   175  }
   176  
   177  // Stop transitions a worker to the stopping state. Check
   178  // to see if the worker stopped by calling IsStopped()
   179  func (w *Worker) Stop() {
   180  	w.setStatus(WorkerStopping)
   181  	w.shutdown()
   182  }
   183  
   184  // IsStarted returns a boolean indicating if this worker has been started.
   185  func (w *Worker) IsStarted() bool {
   186  	return w.GetStatus() == WorkerStarted
   187  }
   188  
   189  // IsPaused returns a boolean indicating if this worker has been paused.
   190  func (w *Worker) IsPaused() bool {
   191  	return w.GetStatus() == WorkerPaused
   192  }
   193  
   194  // IsStopped returns a boolean indicating if this worker has been stopped.
   195  func (w *Worker) IsStopped() bool {
   196  	return w.GetStatus() == WorkerStopped
   197  }
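
        // Example (sketch): Pause and Resume only request a state transition; the
        // worker reaches Paused or Started asynchronously, once the workload hits
        // its next pause point. A caller that needs to observe the transition can
        // poll the status accessors (busy-waiting shown for illustration only):
        //
        //	w.Pause()
        //	for !w.IsPaused() {
        //		time.Sleep(10 * time.Millisecond)
        //	}
        //	w.Resume()
        //	for !w.IsStarted() {
        //		time.Sleep(10 * time.Millisecond)
        //	}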
   198  
   199  func (w *Worker) isPausable() bool {
   200  	w.statusLock.RLock()
   201  	defer w.statusLock.RUnlock()
   202  	switch w.status {
   203  	case WorkerPausing, WorkerPaused, WorkerStopping, WorkerStopped:
   204  		return false
   205  	default:
   206  		return true
   207  	}
   208  }
   209  
   210  // GetStatus returns the status of the Worker
   211  func (w *Worker) GetStatus() WorkerStatus {
   212  	w.statusLock.RLock()
   213  	defer w.statusLock.RUnlock()
   214  	return w.status
   215  }
   216  
   217  // setStatuses is used internally to the worker to update the
   218  // status of the worker and workload at one time, since some
   219  // transitions need to update both values using the same lock.
   220  func (w *Worker) setStatuses(newWorkerStatus WorkerStatus, newWorkloadStatus SchedulerWorkerStatus) {
   221  	w.statusLock.Lock()
   222  	defer w.statusLock.Unlock()
   223  	w.setWorkerStatusLocked(newWorkerStatus)
   224  	w.setWorkloadStatusLocked(newWorkloadStatus)
   225  }
   226  
   227  // setStatus is used internally to the worker to update the
   228  // status of the worker based on calls to the Worker API. For
   229  // atomically updating the scheduler status and the workload
   230  // status, use `setStatuses`.
   231  func (w *Worker) setStatus(newStatus WorkerStatus) {
   232  	w.statusLock.Lock()
   233  	defer w.statusLock.Unlock()
   234  	w.setWorkerStatusLocked(newStatus)
   235  }
   236  
   237  func (w *Worker) setWorkerStatusLocked(newStatus WorkerStatus) {
   238  	if newStatus == w.status {
   239  		return
   240  	}
   241  	w.logger.Trace("changed worker status", "from", w.status, "to", newStatus)
   242  	w.status = newStatus
   243  }
   244  
   245  // GetWorkloadStatus returns the status of the Worker's workload.
   246  func (w *Worker) GetWorkloadStatus() SchedulerWorkerStatus {
   247  	w.statusLock.RLock()
   248  	defer w.statusLock.RUnlock()
   249  	return w.workloadStatus
   250  }
   251  
   252  // setWorkloadStatus is used internally to the worker to update the
   253  // status of the workload based on updates from the workload.
   254  func (w *Worker) setWorkloadStatus(newStatus SchedulerWorkerStatus) {
   255  	w.statusLock.Lock()
   256  	defer w.statusLock.Unlock()
   257  	w.setWorkloadStatusLocked(newStatus)
   258  }
   259  
   260  func (w *Worker) setWorkloadStatusLocked(newStatus SchedulerWorkerStatus) {
   261  	if newStatus == w.workloadStatus {
   262  		return
   263  	}
   264  	w.logger.Trace("changed workload status", "from", w.workloadStatus, "to", newStatus)
   265  	w.workloadStatus = newStatus
   266  }
   267  
   268  type WorkerInfo struct {
   269  	ID                string    `json:"id"`
   270  	EnabledSchedulers []string  `json:"enabled_schedulers"`
   271  	Started           time.Time `json:"started"`
   272  	Status            string    `json:"status"`
   273  	WorkloadStatus    string    `json:"workload_status"`
   274  }
   275  
   276  func (w WorkerInfo) Copy() WorkerInfo {
   277  	out := WorkerInfo{
   278  		ID:                w.ID,
   279  		EnabledSchedulers: make([]string, len(w.EnabledSchedulers)),
   280  		Started:           w.Started,
   281  		Status:            w.Status,
   282  		WorkloadStatus:    w.WorkloadStatus,
   283  	}
   284  	copy(out.EnabledSchedulers, w.EnabledSchedulers)
   285  	return out
   286  }
   287  
   288  func (w WorkerInfo) String() string {
   289  	// lazy implementation of WorkerInfo to string
   290  	out, _ := json.Marshal(w)
   291  	return string(out)
   292  }
   293  
   294  func (w *Worker) Info() WorkerInfo {
   295  	w.pauseLock.Lock()
   296  	defer w.pauseLock.Unlock()
   297  	out := WorkerInfo{
   298  		ID:                w.id,
   299  		Status:            w.status.String(),
   300  		WorkloadStatus:    w.workloadStatus.String(),
   301  		EnabledSchedulers: make([]string, len(w.enabledSchedulers)),
   302  	}
   303  	out.Started = w.start
   304  	copy(out.EnabledSchedulers, w.enabledSchedulers)
   305  	return out
   306  }
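
        // Example (sketch): WorkerInfo.String renders the struct through its JSON
        // tags, so Info().String() produces output of roughly this shape (the
        // values shown are illustrative):
        //
        //	{"id":"b1d6e4f2-...","enabled_schedulers":["service","batch"],
        //	 "started":"2023-07-01T12:00:00Z","status":"Started",
        //	 "workload_status":"WaitingToDequeue"}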
   307  
   308  // ----------------------------------
   309  //  Pause Implementation
   310  //    These functions are used to support the worker's pause behaviors.
   311  // ----------------------------------
   312  
   313  func (w *Worker) setPauseFlag(pause bool) {
   314  	w.pauseLock.Lock()
   315  	defer w.pauseLock.Unlock()
   316  	w.pauseFlag = pause
   317  }
   318  
   319  // maybeWait is responsible for making the transition from `pausing`
   320  // to `paused`, waiting, and then transitioning back to the running
   321  // values.
   322  func (w *Worker) maybeWait() {
   323  	w.pauseLock.Lock()
   324  	defer w.pauseLock.Unlock()
   325  
   326  	if !w.pauseFlag {
   327  		return
   328  	}
   329  
   330  	w.statusLock.Lock()
   331  	w.status = WorkerPaused
   332  	originalWorkloadStatus := w.workloadStatus
   333  	w.workloadStatus = WorkloadPaused
   334  	w.logger.Trace("changed workload status", "from", originalWorkloadStatus, "to", w.workloadStatus)
   335  
   336  	w.statusLock.Unlock()
   337  
   338  	for w.pauseFlag {
   339  		w.pauseCond.Wait()
   340  	}
   341  
   342  	w.statusLock.Lock()
   343  
   344  	w.logger.Trace("changed workload status", "from", w.workloadStatus, "to", originalWorkloadStatus)
   345  	w.workloadStatus = originalWorkloadStatus
   346  
   347  	// Only reset the worker status to Started if the worker is not being resumed in order to stop the paused workload.
   348  	if w.status != WorkerStopping {
   349  		w.logger.Trace("changed worker status", "from", w.status, "to", WorkerStarted)
   350  		w.status = WorkerStarted
   351  	}
   352  	w.statusLock.Unlock()
   353  }
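
        // The full pause handshake, sketched as an ordering of events:
        //
        //	Pause()      // API side: status -> Pausing, pauseFlag set
        //	maybeWait()  // workload side, at the top of the dequeue loop:
        //	             //   status -> Paused, blocks in pauseCond.Wait()
        //	Resume()     // API side: status -> Resuming, pauseFlag cleared, Broadcast()
        //	             // maybeWait() wakes, restores the workload status, and moves
        //	             //   the worker back to Started (unless it is Stopping)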
   354  
   355  // shutdown is used to signal that the worker should shut down.
   356  func (w *Worker) shutdown() {
   357  	w.pauseLock.Lock()
   358  	wasPaused := w.pauseFlag
   359  	w.pauseFlag = false
   360  	w.pauseLock.Unlock()
   361  
   362  	w.logger.Trace("shutdown request received")
   363  	w.cancelFn()
   364  	if wasPaused {
   365  		w.pauseCond.Broadcast()
   366  	}
   367  }
   368  
   369  // markStopped is used to mark the worker and workload as stopped. It should be called in a
   370  // defer immediately upon entering the run() function.
   371  func (w *Worker) markStopped() {
   372  	w.setStatuses(WorkerStopped, WorkloadStopped)
   373  	w.logger.Debug("stopped")
   374  }
   375  
   376  func (w *Worker) workerShuttingDown() bool {
   377  	select {
   378  	case <-w.ctx.Done():
   379  		return true
   380  	default:
   381  		return false
   382  	}
   383  }
   384  
   385  // ----------------------------------
   386  //  Workload behavior code
   387  // ----------------------------------
   388  
   389  // run is the long-lived goroutine which is used to run the worker
   390  func (w *Worker) run(raftSyncLimit time.Duration) {
   391  	defer func() {
   392  		w.markStopped()
   393  	}()
   394  	w.setStatuses(WorkerStarted, WorkloadRunning)
   395  	w.logger.Debug("running")
   396  	for {
   397  		// Check to see if the context has been cancelled. Server shutdown and
   398  		// Stop() should do this.
   399  		if w.workerShuttingDown() {
   400  			return
   401  		}
   402  		// Dequeue a pending evaluation
   403  		eval, token, waitIndex, shutdown := w.dequeueEvaluation(dequeueTimeout)
   404  		if shutdown {
   405  			return
   406  		}
   407  
   408  		// Since dequeuing takes time, the server could have shut down after we
   409  		// got an eval that needs to be nacked before we exit. Explicitly check
   410  		// the server to see whether this eval should still be processed.
   411  		if w.srv.IsShutdown() {
   412  			w.logger.Warn("nacking eval because the server is shutting down",
   413  				"eval", log.Fmt("%#v", eval))
   414  			w.sendNack(eval, token)
   415  			return
   416  		}
   417  
   418  		// Wait for the raft log to catch up to the evaluation
   419  		w.setWorkloadStatus(WorkloadWaitingForRaft)
   420  		snap, err := w.snapshotMinIndex(waitIndex, raftSyncLimit)
   421  		if err != nil {
   422  			var timeoutErr ErrMinIndexDeadlineExceeded
   423  			if errors.As(err, &timeoutErr) {
   424  				w.logger.Warn("timeout waiting for Raft index required by eval",
   425  					"eval", eval.ID, "index", waitIndex, "timeout", raftSyncLimit)
   426  				w.sendNack(eval, token)
   427  
   428  				// Timing out above means this server is woefully behind the
   429  				// leader's index. This can happen when a new server is added to
   430  				// a cluster and must initially sync the cluster state.
   431  				// Backoff dequeuing another eval until there's some indication
   432  				// this server would be up to date enough to process it.
   433  				slowServerSyncLimit := 10 * raftSyncLimit
   434  				if _, err := w.snapshotMinIndex(waitIndex, slowServerSyncLimit); err != nil {
   435  					w.logger.Warn("server is unable to catch up to last eval's index", "error", err)
   436  				}
   437  
   438  			} else if errors.Is(err, context.Canceled) {
   439  				// If the server has shutdown while we're waiting, we'll get the
   440  				// Canceled error from the worker's context. We need to nack any
   441  				// dequeued evals before we exit.
   442  				w.logger.Warn("nacking eval because the server is shutting down", "eval", eval.ID)
   443  				w.sendNack(eval, token)
   444  				return
   445  			} else {
   446  				w.logger.Error("error waiting for Raft index", "error", err, "index", waitIndex)
   447  				w.sendNack(eval, token)
   448  			}
   449  
   450  			continue
   451  		}
   452  
   453  		// Invoke the scheduler to determine placements
   454  		w.setWorkloadStatus(WorkloadScheduling)
   455  		if err := w.invokeScheduler(snap, eval, token); err != nil {
   456  			w.logger.Error("error invoking scheduler", "error", err)
   457  			w.sendNack(eval, token)
   458  			continue
   459  		}
   460  
   461  		// Complete the evaluation
   462  		w.sendAck(eval, token)
   463  	}
   464  }
   465  
   466  // dequeueEvaluation is used to fetch the next ready evaluation.
   467  // This blocks until an evaluation is available or a timeout is reached.
   468  func (w *Worker) dequeueEvaluation(timeout time.Duration) (
   469  	eval *structs.Evaluation, token string, waitIndex uint64, shutdown bool) {
   470  	// Setup the request
   471  	req := structs.EvalDequeueRequest{
   472  		Schedulers:       w.enabledSchedulers,
   473  		Timeout:          timeout,
   474  		SchedulerVersion: scheduler.SchedulerVersion,
   475  		WriteRequest: structs.WriteRequest{
   476  			Region: w.srv.config.Region,
   477  		},
   478  	}
   479  	var resp structs.EvalDequeueResponse
   480  
   481  REQ:
   482  	// Wait inside this function if the worker is paused.
   483  	w.maybeWait()
   484  	// Immediately check to see if the worker has been shut down.
   485  	if w.workerShuttingDown() {
   486  		return nil, "", 0, true
   487  	}
   488  
   489  	// Make a blocking RPC
   490  	start := time.Now()
   491  	w.setWorkloadStatus(WorkloadWaitingToDequeue)
   492  	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
   493  	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
   494  	if err != nil {
   495  		if time.Since(w.start) > dequeueErrGrace && !w.workerShuttingDown() {
   496  			w.logger.Error("failed to dequeue evaluation", "error", err)
   497  		}
   498  
   499  		// Adjust the backoff based on the error. If it is a scheduler version
   500  		// mismatch we increase the baseline.
   501  		base, limit := backoffBaselineFast, backoffLimitSlow
   502  		if strings.Contains(err.Error(), "calling scheduler version") {
   503  			base = backoffSchedulerVersionMismatch
   504  			limit = backoffSchedulerVersionMismatch
   505  		}
   506  
   507  		if w.backoffErr(base, limit) {
   508  			return nil, "", 0, true
   509  		}
   510  		goto REQ
   511  	}
   512  	w.backoffReset()
   513  
   514  	// Check if we got a response
   515  	if resp.Eval != nil {
   516  		w.logger.Debug("dequeued evaluation", "eval_id", resp.Eval.ID, "type", resp.Eval.Type, "namespace", resp.Eval.Namespace, "job_id", resp.Eval.JobID, "node_id", resp.Eval.NodeID, "triggered_by", resp.Eval.TriggeredBy)
   517  		return resp.Eval, resp.Token, resp.GetWaitIndex(), false
   518  	}
   519  
   520  	goto REQ
   521  }
   522  
   523  // sendAcknowledgement should not be called directly. Call `sendAck` or `sendNack` instead.
   524  // This function implements `ack`ing or `nack`ing the evaluation generally.
   525  // Any errors are logged but swallowed.
   526  func (w *Worker) sendAcknowledgement(eval *structs.Evaluation, token string, ack bool) {
   527  	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
   528  	// Setup the request
   529  	req := structs.EvalAckRequest{
   530  		EvalID: eval.ID,
   531  		Token:  token,
   532  		WriteRequest: structs.WriteRequest{
   533  			Region: w.srv.config.Region,
   534  		},
   535  	}
   536  	var resp structs.GenericResponse
   537  
   538  	// Determine if this is an Ack or Nack
   539  	verb := "ack"
   540  	endpoint := "Eval.Ack"
   541  	if !ack {
   542  		verb = "nack"
   543  		endpoint = "Eval.Nack"
   544  	}
   545  
   546  	// Make the RPC call
   547  	err := w.srv.RPC(endpoint, &req, &resp)
   548  	if err != nil {
   549  		w.logger.Error(fmt.Sprintf("failed to %s evaluation", verb), "eval_id", eval.ID, "error", err)
   550  	} else {
   551  		w.logger.Debug(fmt.Sprintf("%s evaluation", verb), "eval_id", eval.ID, "type", eval.Type, "namespace", eval.Namespace, "job_id", eval.JobID, "node_id", eval.NodeID, "triggered_by", eval.TriggeredBy)
   552  	}
   553  }
   554  
   555  // sendNack makes a best effort to nack the evaluation.
   556  // Any errors are logged but swallowed.
   557  func (w *Worker) sendNack(eval *structs.Evaluation, token string) {
   558  	w.sendAcknowledgement(eval, token, false)
   559  }
   560  
   561  // sendAck makes a best effort to ack the evaluation.
   562  // Any errors are logged but swallowed.
   563  func (w *Worker) sendAck(eval *structs.Evaluation, token string) {
   564  	w.sendAcknowledgement(eval, token, true)
   565  }
   566  
   567  type ErrMinIndexDeadlineExceeded struct {
   568  	waitIndex uint64
   569  	timeout   time.Duration
   570  }
   571  
   572  // Unwrapping an ErrMinIndexDeadlineExceeded always returns
   573  // context.DeadlineExceeded
   574  func (ErrMinIndexDeadlineExceeded) Unwrap() error {
   575  	return context.DeadlineExceeded
   576  }
   577  
   578  func (e ErrMinIndexDeadlineExceeded) Error() string {
   579  	return fmt.Sprintf("timed out after %s waiting for index=%d", e.timeout, e.waitIndex)
   580  }
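
        // Example (sketch): because Unwrap returns context.DeadlineExceeded, callers
        // can detect the timeout either by concrete type or by the wrapped sentinel:
        //
        //	var timeoutErr ErrMinIndexDeadlineExceeded
        //	if errors.As(err, &timeoutErr) {
        //		// timed out waiting for the required index
        //	}
        //	if errors.Is(err, context.DeadlineExceeded) {
        //		// same condition, matched via the wrapped sentinel
        //	}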
   581  
   582  // snapshotMinIndex times calls to StateStore.SnapshotMinIndex, which may block.
   583  func (w *Worker) snapshotMinIndex(waitIndex uint64, timeout time.Duration) (*state.StateSnapshot, error) {
   584  	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, time.Now())
   585  
   586  	ctx, cancel := context.WithTimeout(w.ctx, timeout)
   587  	snap, err := w.srv.fsm.State().SnapshotMinIndex(ctx, waitIndex)
   588  	cancel()
   589  
   590  	// Wrap error to ensure callers can detect timeouts.
   591  	if errors.Is(err, context.DeadlineExceeded) {
   592  		return nil, ErrMinIndexDeadlineExceeded{
   593  			waitIndex: waitIndex,
   594  			timeout:   timeout,
   595  		}
   596  	}
   597  
   598  	return snap, err
   599  }
   600  
   601  // invokeScheduler is used to invoke the business logic of the scheduler
   602  func (w *Worker) invokeScheduler(snap *state.StateSnapshot, eval *structs.Evaluation, token string) error {
   603  	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
   604  	// Store the evaluation token
   605  	w.evalToken = token
   606  
   607  	// Store the snapshot's index
   608  	var err error
   609  	w.snapshotIndex, err = snap.LatestIndex()
   610  	if err != nil {
   611  		return fmt.Errorf("failed to determine snapshot's index: %v", err)
   612  	}
   613  
   614  	// Create the scheduler, or use the special core scheduler
   615  	var sched scheduler.Scheduler
   616  	if eval.Type == structs.JobTypeCore {
   617  		sched = NewCoreScheduler(w.srv, snap)
   618  	} else {
   619  		sched, err = scheduler.NewScheduler(eval.Type, w.logger, w.srv.workersEventCh, snap, w)
   620  		if err != nil {
   621  			return fmt.Errorf("failed to instantiate scheduler: %v", err)
   622  		}
   623  	}
   624  
   625  	// Process the evaluation
   626  	err = sched.Process(eval)
   627  	if err != nil {
   628  		return fmt.Errorf("failed to process evaluation: %v", err)
   629  	}
   630  	return nil
   631  }
   632  
   633  // ServersMeetMinimumVersion allows implementations of the Scheduler interface in
   634  // other packages to perform server version checks without direct references to
   635  // the Nomad server.
   636  func (w *Worker) ServersMeetMinimumVersion(minVersion *version.Version, checkFailedServers bool) bool {
   637  	return ServersMeetMinimumVersion(w.srv.Members(), w.srv.Region(), minVersion, checkFailedServers)
   638  }
   639  
   640  // SubmitPlan is used to submit a plan for consideration. This allows
   641  // the worker to act as the planner for the scheduler.
   642  func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
   643  	// Check for a shutdown before plan submission. Checking server state rather than
   644  	// worker state to allow work in flight to complete before stopping.
   645  	if w.srv.IsShutdown() {
   646  		return nil, nil, fmt.Errorf("shutdown while planning")
   647  	}
   648  	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())
   649  
   650  	// Add the evaluation token to the plan
   651  	plan.EvalToken = w.evalToken
   652  
   653  	// Add SnapshotIndex to ensure leader's StateStore processes the Plan
   654  	// at or after the index it was created.
   655  	plan.SnapshotIndex = w.snapshotIndex
   656  
   657  	// Normalize stopped and preempted allocs before RPC
   658  	normalizePlan := ServersMeetMinimumVersion(w.srv.Members(), w.srv.Region(), MinVersionPlanNormalization, true)
   659  	if normalizePlan {
   660  		plan.NormalizeAllocations()
   661  	}
   662  
   663  	// Setup the request
   664  	req := structs.PlanRequest{
   665  		Plan: plan,
   666  		WriteRequest: structs.WriteRequest{
   667  			Region: w.srv.config.Region,
   668  		},
   669  	}
   670  	var resp structs.PlanResponse
   671  
   672  SUBMIT:
   673  	// Make the RPC call
   674  	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
   675  		w.logger.Error("failed to submit plan for evaluation", "eval_id", plan.EvalID, "error", err)
   676  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   677  			goto SUBMIT
   678  		}
   679  		return nil, nil, err
   680  	} else {
   681  		w.logger.Debug("submitted plan for evaluation", "eval_id", plan.EvalID)
   682  		w.backoffReset()
   683  	}
   684  
   685  	// Look for a result
   686  	result := resp.Result
   687  	if result == nil {
   688  		return nil, nil, fmt.Errorf("missing result")
   689  	}
   690  
   691  	// Check if a state update is required. This can happen if we planned
   692  	// based on stale data, which is now causing issues; for example, a node
   693  	// failure since the time we started planning, or conflicting task
   694  	// allocations.
   695  	var state scheduler.State
   696  	if result.RefreshIndex != 0 {
   697  		// Wait for the raft log to catch up to the evaluation
   698  		w.logger.Debug("refreshing state", "refresh_index", result.RefreshIndex, "eval_id", plan.EvalID)
   699  
   700  		var err error
   701  		state, err = w.snapshotMinIndex(result.RefreshIndex, raftSyncLimit)
   702  		if err != nil {
   703  			return nil, nil, err
   704  		}
   705  	}
   706  
   707  	// Return the result and potential state update
   708  	return result, state, nil
   709  }
   710  
   711  // UpdateEval is used to submit an updated evaluation. This allows
   712  // the worker to act as the planner for the scheduler.
   713  func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
   714  	// Check for a shutdown before plan submission. Checking server state rather than
   715  	// worker state to allow a worker's work in flight to complete before stopping.
   716  	if w.srv.IsShutdown() {
   717  		return fmt.Errorf("shutdown while planning")
   718  	}
   719  	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())
   720  
   721  	// Store the snapshot index in the eval
   722  	eval.SnapshotIndex = w.snapshotIndex
   723  	eval.UpdateModifyTime()
   724  
   725  	// Setup the request
   726  	req := structs.EvalUpdateRequest{
   727  		Evals:     []*structs.Evaluation{eval},
   728  		EvalToken: w.evalToken,
   729  		WriteRequest: structs.WriteRequest{
   730  			Region: w.srv.config.Region,
   731  		},
   732  	}
   733  	var resp structs.GenericResponse
   734  
   735  SUBMIT:
   736  	// Make the RPC call
   737  	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
   738  		w.logger.Error("failed to update evaluation", "eval", log.Fmt("%#v", eval), "error", err)
   739  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   740  			goto SUBMIT
   741  		}
   742  		return err
   743  	} else {
   744  		w.logger.Debug("updated evaluation", "eval", log.Fmt("%#v", eval))
   745  		w.backoffReset()
   746  	}
   747  	return nil
   748  }
   749  
   750  // CreateEval is used to create a new evaluation. This allows
   751  // the worker to act as the planner for the scheduler.
   752  func (w *Worker) CreateEval(eval *structs.Evaluation) error {
   753  	// Check for a shutdown before plan submission. This consults the server Shutdown state
   754  	// instead of the worker's to prevent aborting work in flight.
   755  	if w.srv.IsShutdown() {
   756  		return fmt.Errorf("shutdown while planning")
   757  	}
   758  	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())
   759  
   760  	// Store the snapshot index in the eval
   761  	eval.SnapshotIndex = w.snapshotIndex
   762  
   763  	now := time.Now().UTC().UnixNano()
   764  	eval.CreateTime = now
   765  	eval.ModifyTime = now
   766  
   767  	// Setup the request
   768  	req := structs.EvalUpdateRequest{
   769  		Evals:     []*structs.Evaluation{eval},
   770  		EvalToken: w.evalToken,
   771  		WriteRequest: structs.WriteRequest{
   772  			Region: w.srv.config.Region,
   773  		},
   774  	}
   775  	var resp structs.GenericResponse
   776  
   777  SUBMIT:
   778  	// Make the RPC call
   779  	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
   780  		w.logger.Error("failed to create evaluation", "eval", log.Fmt("%#v", eval), "error", err)
   781  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   782  			goto SUBMIT
   783  		}
   784  		return err
   785  	} else {
   786  		w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval))
   787  		w.backoffReset()
   788  	}
   789  	return nil
   790  }
   791  
   792  // ReblockEval is used to reinsert a blocked evaluation into the blocked eval
   793  // tracker. This allows the worker to act as the planner for the scheduler.
   794  func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
   795  	// Check for a shutdown before plan submission. This checks the server state rather than
   796  	// the worker's to prevent erroring on work in flight that would complete otherwise.
   797  	if w.srv.IsShutdown() {
   798  		return fmt.Errorf("shutdown while planning")
   799  	}
   800  	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())
   801  
   802  	// Update the evaluation if the queued allocations are not the same as
   803  	// what is recorded in the job summary
   804  	ws := memdb.NewWatchSet()
   805  	summary, err := w.srv.fsm.state.JobSummaryByID(ws, eval.Namespace, eval.JobID)
   806  	if err != nil {
   807  		return fmt.Errorf("couldn't retrieve job summary: %v", err)
   808  	}
   809  	if summary != nil {
   810  		var hasChanged bool
   811  		for tg, summary := range summary.Summary {
   812  			if queued, ok := eval.QueuedAllocations[tg]; ok {
   813  				if queued != summary.Queued {
   814  					hasChanged = true
   815  					break
   816  				}
   817  			}
   818  		}
   819  		if hasChanged {
   820  			if err := w.UpdateEval(eval); err != nil {
   821  				return err
   822  			}
   823  		}
   824  	}
   825  
   826  	// Store the snapshot index in the eval
   827  	eval.SnapshotIndex = w.snapshotIndex
   828  	eval.UpdateModifyTime()
   829  
   830  	// Setup the request
   831  	req := structs.EvalUpdateRequest{
   832  		Evals:     []*structs.Evaluation{eval},
   833  		EvalToken: w.evalToken,
   834  		WriteRequest: structs.WriteRequest{
   835  			Region: w.srv.config.Region,
   836  		},
   837  	}
   838  	var resp structs.GenericResponse
   839  
   840  SUBMIT:
   841  	// Make the RPC call
   842  	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
   843  		w.logger.Error("failed to reblock evaluation", "eval", log.Fmt("%#v", eval), "error", err)
   844  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   845  			goto SUBMIT
   846  		}
   847  		return err
   848  	} else {
   849  		w.logger.Debug("reblocked evaluation", "eval", log.Fmt("%#v", eval))
   850  		w.backoffReset()
   851  	}
   852  	return nil
   853  }
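
        // SubmitPlan, UpdateEval, CreateEval and ReblockEval are what let the worker
        // act as the planner for the scheduler. Assuming scheduler.Planner is the
        // interface that groups these methods (as the doc comments above suggest), a
        // compile-time check would look like:
        //
        //	var _ scheduler.Planner = (*Worker)(nil)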
   854  
   855  // shouldResubmit checks if a given error should be swallowed and the plan
   856  // resubmitted after a backoff. Usually these are transient errors that
   857  // the cluster should heal from quickly.
   858  func (w *Worker) shouldResubmit(err error) bool {
   859  	s := err.Error()
   860  	switch {
   861  	case strings.Contains(s, "No cluster leader"):
   862  		return true
   863  	case strings.Contains(s, "plan queue is disabled"):
   864  		return true
   865  	default:
   866  		return false
   867  	}
   868  }
   869  
   870  // backoffErr is used to do an exponential backoff on error. The backoff
   871  // state is maintained per worker. It returns true if attempts should be
   872  // abandoned due to shutdown.
   873  // This uses the worker's context in order to immediately stop the
   874  // backoff if the server or the worker is shut down.
   875  func (w *Worker) backoffErr(base, limit time.Duration) bool {
   876  	w.setWorkloadStatus(WorkloadBackoff)
   877  	backoff := (1 << (2 * w.failures)) * base
   878  	if backoff > limit {
   879  		backoff = limit
   880  	} else {
   881  		w.failures++
   882  	}
   883  	select {
   884  	case <-time.After(backoff):
   885  		return false
   886  	case <-w.ctx.Done():
   887  		return true
   888  	}
   889  }
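
        // Worked example (sketch): with base = backoffBaselineFast (20ms) and
        // limit = backoffLimitSlow (10s), the wait grows as (1 << (2*failures)) * base:
        //
        //	failures=0 ->   1 * 20ms = 20ms
        //	failures=1 ->   4 * 20ms = 80ms
        //	failures=2 ->  16 * 20ms = 320ms
        //	failures=3 ->  64 * 20ms = 1.28s
        //	failures=4 -> 256 * 20ms = 5.12s
        //	failures=5 -> would be 20.48s, so the wait is capped at the 10s limit
        //	              and the failure counter stops increasing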
   890  
   891  // backoffReset is used to reset the failure count for
   892  // exponential backoff
   893  func (w *Worker) backoffReset() {
   894  	w.failures = 0
   895  }