github.com/uchennaokeke444/nomad@v0.11.8/nomad/worker.go

     1  package nomad
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	metrics "github.com/armon/go-metrics"
    11  	log "github.com/hashicorp/go-hclog"
    12  	memdb "github.com/hashicorp/go-memdb"
    13  	"github.com/hashicorp/nomad/nomad/state"
    14  	"github.com/hashicorp/nomad/nomad/structs"
    15  	"github.com/hashicorp/nomad/scheduler"
    16  )
    17  
    18  const (
    19  	// backoffBaselineFast is the baseline time for exponential backoff
    20  	backoffBaselineFast = 20 * time.Millisecond
    21  
     22  	// backoffBaselineSlow is the baseline time for exponential backoff,
     23  	// but is much slower than backoffBaselineFast
    24  	backoffBaselineSlow = 500 * time.Millisecond
    25  
    26  	// backoffLimitSlow is the limit of the exponential backoff for
    27  	// the slower backoff
    28  	backoffLimitSlow = 10 * time.Second
    29  
    30  	// backoffSchedulerVersionMismatch is the backoff between retries when the
     31  	// scheduler version does not match the leader's.
    32  	backoffSchedulerVersionMismatch = 30 * time.Second
    33  
     34  	// dequeueTimeout is used to time out an evaluation dequeue so that
    35  	// we can check if there is a shutdown event
    36  	dequeueTimeout = 500 * time.Millisecond
    37  
    38  	// raftSyncLimit is the limit of time we will wait for Raft replication
    39  	// to catch up to the evaluation. This is used to fast Nack and
    40  	// allow another scheduler to pick it up.
    41  	raftSyncLimit = 5 * time.Second
    42  
     43  	// dequeueErrGrace is the grace period after start during which we don't
     44  	// log dequeue errors. This improves the user experience in dev mode,
     45  	// where the leader isn't elected for a few seconds.
    46  	dequeueErrGrace = 10 * time.Second
    47  )
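
         // Taken together, these constants bound the worker's retry behavior:
         // Eval.Dequeue blocks for at most dequeueTimeout per attempt, waiting
         // for Raft catch-up is bounded by raftSyncLimit before the eval is
         // Nacked, and failed RPCs back off exponentially from
         // backoffBaselineFast (dequeue) or backoffBaselineSlow (plan and eval
         // updates) up to backoffLimitSlow; see backoffErr below.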
    48  
     49  // Worker is a single-threaded scheduling worker. There may be multiple
     50  // workers running per server (leader or follower). They are responsible for
     51  // dequeuing pending evaluations, invoking schedulers, submitting plans, and
     52  // managing the lifecycle around task allocations. They bridge the scheduler's
     53  // business logic with the plumbing required to make it all work.
    54  type Worker struct {
    55  	srv    *Server
    56  	logger log.Logger
    57  	start  time.Time
    58  
    59  	paused    bool
    60  	pauseLock sync.Mutex
    61  	pauseCond *sync.Cond
    62  
    63  	failures uint
    64  
    65  	evalToken string
    66  
    67  	// snapshotIndex is the index of the snapshot in which the scheduler was
    68  	// first invoked. It is used to mark the SnapshotIndex of evaluations
    69  	// Created, Updated or Reblocked.
    70  	snapshotIndex uint64
    71  }
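
         // Only paused is shared across goroutines (SetPause may be called from
         // outside the worker) and it is guarded by pauseLock/pauseCond; the
         // failures, evalToken and snapshotIndex fields are read and written
         // solely by the worker's own run goroutine and the scheduler it
         // invokes synchronously.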
    72  
    73  // NewWorker starts a new worker associated with the given server
    74  func NewWorker(srv *Server) (*Worker, error) {
    75  	w := &Worker{
    76  		srv:    srv,
    77  		logger: srv.logger.ResetNamed("worker"),
    78  		start:  time.Now(),
    79  	}
    80  	w.pauseCond = sync.NewCond(&w.pauseLock)
    81  	go w.run()
    82  	return w, nil
    83  }
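
         // A server typically owns a pool of these workers. Roughly, as an
         // illustrative sketch only (numWorkers stands in for however many
         // scheduler workers the server is configured to run):
         //
         //	workers := make([]*Worker, 0, numWorkers)
         //	for i := 0; i < numWorkers; i++ {
         //		w, err := NewWorker(srv)
         //		if err != nil {
         //			return err
         //		}
         //		workers = append(workers, w)
         //	}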
    84  
    85  // SetPause is used to pause or unpause a worker
    86  func (w *Worker) SetPause(p bool) {
    87  	w.pauseLock.Lock()
    88  	w.paused = p
    89  	w.pauseLock.Unlock()
    90  	if !p {
    91  		w.pauseCond.Broadcast()
    92  	}
    93  }
    94  
    95  // checkPaused is used to park the worker when paused
    96  func (w *Worker) checkPaused() {
    97  	w.pauseLock.Lock()
    98  	for w.paused {
    99  		w.pauseCond.Wait()
   100  	}
   101  	w.pauseLock.Unlock()
   102  }
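
         // SetPause(true) does not interrupt an in-flight evaluation; the worker
         // only parks at the top of its dequeue loop, where checkPaused blocks
         // on the condition variable until SetPause(false) broadcasts. An
         // illustrative (hypothetical) use is quiescing all workers and then
         // resuming them:
         //
         //	for _, w := range workers {
         //		w.SetPause(true)
         //	}
         //	// ... perform the change ...
         //	for _, w := range workers {
         //		w.SetPause(false)
         //	}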
   103  
   104  // run is the long-lived goroutine which is used to run the worker
   105  func (w *Worker) run() {
   106  	for {
   107  		// Dequeue a pending evaluation
   108  		eval, token, waitIndex, shutdown := w.dequeueEvaluation(dequeueTimeout)
   109  		if shutdown {
   110  			return
   111  		}
   112  
   113  		// Check for a shutdown
   114  		if w.srv.IsShutdown() {
   115  			w.logger.Error("nacking eval because the server is shutting down", "eval", log.Fmt("%#v", eval))
   116  			w.sendAck(eval.ID, token, false)
   117  			return
   118  		}
   119  
    120  		// Wait for the Raft log to catch up to the evaluation
   121  		snap, err := w.snapshotMinIndex(waitIndex, raftSyncLimit)
   122  		if err != nil {
   123  			w.logger.Error("error waiting for Raft index", "error", err, "index", waitIndex)
   124  			w.sendAck(eval.ID, token, false)
   125  			continue
   126  		}
   127  
   128  		// Invoke the scheduler to determine placements
   129  		if err := w.invokeScheduler(snap, eval, token); err != nil {
   130  			w.logger.Error("error invoking scheduler", "error", err)
   131  			w.sendAck(eval.ID, token, false)
   132  			continue
   133  		}
   134  
   135  		// Complete the evaluation
   136  		w.sendAck(eval.ID, token, true)
   137  	}
   138  }
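
         // A Nack returns the evaluation to the eval broker so it can be
         // redelivered, typically to another worker; an Ack marks it complete
         // and removes it from the broker.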
   139  
   140  // dequeueEvaluation is used to fetch the next ready evaluation.
   141  // This blocks until an evaluation is available or a timeout is reached.
   142  func (w *Worker) dequeueEvaluation(timeout time.Duration) (
   143  	eval *structs.Evaluation, token string, waitIndex uint64, shutdown bool) {
   144  	// Setup the request
   145  	req := structs.EvalDequeueRequest{
   146  		Schedulers:       w.srv.config.EnabledSchedulers,
   147  		Timeout:          timeout,
   148  		SchedulerVersion: scheduler.SchedulerVersion,
   149  		WriteRequest: structs.WriteRequest{
   150  			Region: w.srv.config.Region,
   151  		},
   152  	}
   153  	var resp structs.EvalDequeueResponse
   154  
   155  REQ:
   156  	// Check if we are paused
   157  	w.checkPaused()
   158  
   159  	// Make a blocking RPC
   160  	start := time.Now()
   161  	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
   162  	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
   163  	if err != nil {
   164  		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
   165  			w.logger.Error("failed to dequeue evaluation", "error", err)
   166  		}
   167  
   168  		// Adjust the backoff based on the error. If it is a scheduler version
    169  		// mismatch, we increase the baseline.
   170  		base, limit := backoffBaselineFast, backoffLimitSlow
   171  		if strings.Contains(err.Error(), "calling scheduler version") {
   172  			base = backoffSchedulerVersionMismatch
   173  			limit = backoffSchedulerVersionMismatch
   174  		}
   175  
   176  		if w.backoffErr(base, limit) {
   177  			return nil, "", 0, true
   178  		}
   179  		goto REQ
   180  	}
   181  	w.backoffReset()
   182  
   183  	// Check if we got a response
   184  	if resp.Eval != nil {
   185  		w.logger.Debug("dequeued evaluation", "eval_id", resp.Eval.ID)
   186  		return resp.Eval, resp.Token, resp.GetWaitIndex(), false
   187  	}
   188  
   189  	// Check for potential shutdown
   190  	if w.srv.IsShutdown() {
   191  		return nil, "", 0, true
   192  	}
   193  	goto REQ
   194  }
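
         // The returned waitIndex is the Raft index this server's state store
         // must reach before it is safe to schedule the evaluation; run passes
         // it to snapshotMinIndex so we never plan against a snapshot older
         // than the state the evaluation was created from.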
   195  
   196  // sendAck makes a best effort to ack or nack the evaluation.
   197  // Any errors are logged but swallowed.
   198  func (w *Worker) sendAck(evalID, token string, ack bool) {
   199  	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
   200  	// Setup the request
   201  	req := structs.EvalAckRequest{
   202  		EvalID: evalID,
   203  		Token:  token,
   204  		WriteRequest: structs.WriteRequest{
   205  			Region: w.srv.config.Region,
   206  		},
   207  	}
   208  	var resp structs.GenericResponse
   209  
   210  	// Determine if this is an Ack or Nack
   211  	verb := "ack"
   212  	endpoint := "Eval.Ack"
   213  	if !ack {
   214  		verb = "nack"
   215  		endpoint = "Eval.Nack"
   216  	}
   217  
   218  	// Make the RPC call
   219  	err := w.srv.RPC(endpoint, &req, &resp)
   220  	if err != nil {
   221  		w.logger.Error(fmt.Sprintf("failed to %s evaluation", verb), "eval_id", evalID, "error", err)
   222  	} else {
   223  		w.logger.Debug(fmt.Sprintf("%s evaluation", verb), "eval_id", evalID)
   224  	}
   225  }
   226  
    227  // snapshotMinIndex times calls to StateStore.SnapshotMinIndex which may block.
   228  func (w *Worker) snapshotMinIndex(waitIndex uint64, timeout time.Duration) (*state.StateSnapshot, error) {
   229  	start := time.Now()
   230  	ctx, cancel := context.WithTimeout(w.srv.shutdownCtx, timeout)
   231  	snap, err := w.srv.fsm.State().SnapshotMinIndex(ctx, waitIndex)
   232  	cancel()
   233  	metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
   234  
   235  	// Wrap error to ensure callers don't disregard timeouts.
   236  	if err == context.DeadlineExceeded {
   237  		err = fmt.Errorf("timed out after %s waiting for index=%d", timeout, waitIndex)
   238  	}
   239  
   240  	return snap, err
   241  }
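
         // Wrapping context.DeadlineExceeded matters: run treats any error from
         // snapshotMinIndex as a reason to Nack, which lets another scheduler,
         // likely on a server whose Raft log has caught up, pick the evaluation
         // up instead of leaving this worker blocked.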
   242  
   243  // invokeScheduler is used to invoke the business logic of the scheduler
   244  func (w *Worker) invokeScheduler(snap *state.StateSnapshot, eval *structs.Evaluation, token string) error {
   245  	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
   246  	// Store the evaluation token
   247  	w.evalToken = token
   248  
   249  	// Store the snapshot's index
   250  	var err error
   251  	w.snapshotIndex, err = snap.LatestIndex()
   252  	if err != nil {
   253  		return fmt.Errorf("failed to determine snapshot's index: %v", err)
   254  	}
   255  
   256  	// Create the scheduler, or use the special core scheduler
   257  	var sched scheduler.Scheduler
   258  	if eval.Type == structs.JobTypeCore {
   259  		sched = NewCoreScheduler(w.srv, snap)
   260  	} else {
   261  		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
   262  		if err != nil {
   263  			return fmt.Errorf("failed to instantiate scheduler: %v", err)
   264  		}
   265  	}
   266  
   267  	// Process the evaluation
   268  	err = sched.Process(eval)
   269  	if err != nil {
   270  		return fmt.Errorf("failed to process evaluation: %v", err)
   271  	}
   272  	return nil
   273  }
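
         // Note that the Worker passes itself as the final argument to
         // scheduler.NewScheduler: it acts as the scheduler's planner, so the
         // scheduler calls back into SubmitPlan, UpdateEval, CreateEval and
         // ReblockEval below while processing the evaluation.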
   274  
   275  // SubmitPlan is used to submit a plan for consideration. This allows
   276  // the worker to act as the planner for the scheduler.
   277  func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
   278  	// Check for a shutdown before plan submission
   279  	if w.srv.IsShutdown() {
   280  		return nil, nil, fmt.Errorf("shutdown while planning")
   281  	}
   282  	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())
   283  
   284  	// Add the evaluation token to the plan
   285  	plan.EvalToken = w.evalToken
   286  
    287  	// Add SnapshotIndex to ensure the leader's StateStore processes the Plan
    288  	// at or after the index at which it was created.
   289  	plan.SnapshotIndex = w.snapshotIndex
   290  
   291  	// Normalize stopped and preempted allocs before RPC
   292  	normalizePlan := ServersMeetMinimumVersion(w.srv.Members(), MinVersionPlanNormalization, true)
   293  	if normalizePlan {
   294  		plan.NormalizeAllocations()
   295  	}
   296  
   297  	// Setup the request
   298  	req := structs.PlanRequest{
   299  		Plan: plan,
   300  		WriteRequest: structs.WriteRequest{
   301  			Region: w.srv.config.Region,
   302  		},
   303  	}
   304  	var resp structs.PlanResponse
   305  
   306  SUBMIT:
   307  	// Make the RPC call
   308  	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
   309  		w.logger.Error("failed to submit plan for evaluation", "eval_id", plan.EvalID, "error", err)
   310  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   311  			goto SUBMIT
   312  		}
   313  		return nil, nil, err
   314  	} else {
   315  		w.logger.Debug("submitted plan for evaluation", "eval_id", plan.EvalID)
   316  		w.backoffReset()
   317  	}
   318  
   319  	// Look for a result
   320  	result := resp.Result
   321  	if result == nil {
   322  		return nil, nil, fmt.Errorf("missing result")
   323  	}
   324  
    325  	// Check if a state update is required. This can happen when we
    326  	// planned based on stale data that is now causing issues, for
    327  	// example a node failure since we started planning, or conflicting
    328  	// task allocations.
   329  	var state scheduler.State
   330  	if result.RefreshIndex != 0 {
    331  		// Wait for the Raft log to catch up to the refresh index
   332  		w.logger.Debug("refreshing state", "refresh_index", result.RefreshIndex, "eval_id", plan.EvalID)
   333  
   334  		var err error
   335  		state, err = w.snapshotMinIndex(result.RefreshIndex, raftSyncLimit)
   336  		if err != nil {
   337  			return nil, nil, err
   338  		}
   339  	}
   340  
   341  	// Return the result and potential state update
   342  	return result, state, nil
   343  }
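
         // A non-zero RefreshIndex tells the worker its snapshot was stale when
         // the plan was applied (for example, only part of the plan made it
         // through); the scheduler is expected to retry its placements against
         // the refreshed State returned here rather than the original snapshot.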
   344  
   345  // UpdateEval is used to submit an updated evaluation. This allows
   346  // the worker to act as the planner for the scheduler.
   347  func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
   348  	// Check for a shutdown before plan submission
   349  	if w.srv.IsShutdown() {
   350  		return fmt.Errorf("shutdown while planning")
   351  	}
   352  	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())
   353  
   354  	// Store the snapshot index in the eval
   355  	eval.SnapshotIndex = w.snapshotIndex
   356  	eval.UpdateModifyTime()
   357  
   358  	// Setup the request
   359  	req := structs.EvalUpdateRequest{
   360  		Evals:     []*structs.Evaluation{eval},
   361  		EvalToken: w.evalToken,
   362  		WriteRequest: structs.WriteRequest{
   363  			Region: w.srv.config.Region,
   364  		},
   365  	}
   366  	var resp structs.GenericResponse
   367  
   368  SUBMIT:
   369  	// Make the RPC call
   370  	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
   371  		w.logger.Error("failed to update evaluation", "eval", log.Fmt("%#v", eval), "error", err)
   372  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   373  			goto SUBMIT
   374  		}
   375  		return err
   376  	} else {
   377  		w.logger.Debug("updated evaluation", "eval", log.Fmt("%#v", eval))
   378  		w.backoffReset()
   379  	}
   380  	return nil
   381  }
   382  
   383  // CreateEval is used to create a new evaluation. This allows
   384  // the worker to act as the planner for the scheduler.
   385  func (w *Worker) CreateEval(eval *structs.Evaluation) error {
   386  	// Check for a shutdown before plan submission
   387  	if w.srv.IsShutdown() {
   388  		return fmt.Errorf("shutdown while planning")
   389  	}
   390  	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())
   391  
   392  	// Store the snapshot index in the eval
   393  	eval.SnapshotIndex = w.snapshotIndex
   394  
   395  	now := time.Now().UTC().UnixNano()
   396  	eval.CreateTime = now
   397  	eval.ModifyTime = now
   398  
   399  	// Setup the request
   400  	req := structs.EvalUpdateRequest{
   401  		Evals:     []*structs.Evaluation{eval},
   402  		EvalToken: w.evalToken,
   403  		WriteRequest: structs.WriteRequest{
   404  			Region: w.srv.config.Region,
   405  		},
   406  	}
   407  	var resp structs.GenericResponse
   408  
   409  SUBMIT:
   410  	// Make the RPC call
   411  	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
   412  		w.logger.Error("failed to create evaluation", "eval", log.Fmt("%#v", eval), "error", err)
   413  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   414  			goto SUBMIT
   415  		}
   416  		return err
   417  	} else {
   418  		w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval))
   419  		w.backoffReset()
   420  	}
   421  	return nil
   422  }
   423  
   424  // ReblockEval is used to reinsert a blocked evaluation into the blocked eval
   425  // tracker. This allows the worker to act as the planner for the scheduler.
   426  func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
   427  	// Check for a shutdown before plan submission
   428  	if w.srv.IsShutdown() {
   429  		return fmt.Errorf("shutdown while planning")
   430  	}
   431  	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())
   432  
    433  	// Update the evaluation if its queued allocations are not the same as
    434  	// what is recorded in the job summary
   435  	ws := memdb.NewWatchSet()
   436  	summary, err := w.srv.fsm.state.JobSummaryByID(ws, eval.Namespace, eval.JobID)
   437  	if err != nil {
   438  		return fmt.Errorf("couldn't retrieve job summary: %v", err)
   439  	}
   440  	if summary != nil {
   441  		var hasChanged bool
   442  		for tg, summary := range summary.Summary {
   443  			if queued, ok := eval.QueuedAllocations[tg]; ok {
   444  				if queued != summary.Queued {
   445  					hasChanged = true
   446  					break
   447  				}
   448  			}
   449  		}
   450  		if hasChanged {
   451  			if err := w.UpdateEval(eval); err != nil {
   452  				return err
   453  			}
   454  		}
   455  	}
   456  
   457  	// Store the snapshot index in the eval
   458  	eval.SnapshotIndex = w.snapshotIndex
   459  	eval.UpdateModifyTime()
   460  
   461  	// Setup the request
   462  	req := structs.EvalUpdateRequest{
   463  		Evals:     []*structs.Evaluation{eval},
   464  		EvalToken: w.evalToken,
   465  		WriteRequest: structs.WriteRequest{
   466  			Region: w.srv.config.Region,
   467  		},
   468  	}
   469  	var resp structs.GenericResponse
   470  
   471  SUBMIT:
   472  	// Make the RPC call
   473  	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
   474  		w.logger.Error("failed to reblock evaluation", "eval", log.Fmt("%#v", eval), "error", err)
   475  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   476  			goto SUBMIT
   477  		}
   478  		return err
   479  	} else {
   480  		w.logger.Debug("reblocked evaluation", "eval", log.Fmt("%#v", eval))
   481  		w.backoffReset()
   482  	}
   483  	return nil
   484  }
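
         // SubmitPlan, UpdateEval, CreateEval and ReblockEval all share the same
         // retry discipline: transient errors recognized by shouldResubmit are
         // retried with the slow backoff, and anything else is returned to the
         // scheduler.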
   485  
   486  // shouldResubmit checks if a given error should be swallowed and the plan
   487  // resubmitted after a backoff. Usually these are transient errors that
   488  // the cluster should heal from quickly.
   489  func (w *Worker) shouldResubmit(err error) bool {
   490  	s := err.Error()
   491  	switch {
   492  	case strings.Contains(s, "No cluster leader"):
   493  		return true
   494  	case strings.Contains(s, "plan queue is disabled"):
   495  		return true
   496  	default:
   497  		return false
   498  	}
   499  }
   500  
    501  // backoffErr is used to do an exponential backoff on error. This is
    502  // maintained statefully for the worker. It returns true if attempts should
    503  // be abandoned due to shutdown.
   504  func (w *Worker) backoffErr(base, limit time.Duration) bool {
   505  	backoff := (1 << (2 * w.failures)) * base
   506  	if backoff > limit {
   507  		backoff = limit
   508  	} else {
   509  		w.failures++
   510  	}
   511  	select {
   512  	case <-time.After(backoff):
   513  		return false
   514  	case <-w.srv.shutdownCh:
   515  		return true
   516  	}
   517  }
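
         // With backoffBaselineFast (20ms) and backoffLimitSlow (10s) this
         // yields roughly 20ms, 80ms, 320ms, 1.28s and 5.12s of sleep, then a
         // constant 10s per retry once the computed value (20.48s) exceeds the
         // limit; failures stops incrementing at that point, so the wait never
         // grows beyond the limit.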
   518  
   519  // backoffReset is used to reset the failure count for
   520  // exponential backoff
   521  func (w *Worker) backoffReset() {
   522  	w.failures = 0
   523  }