github.com/emate/nomad@v0.8.2-wo-binpacking/nomad/worker.go

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/armon/go-metrics"
    11  	memdb "github.com/hashicorp/go-memdb"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  	"github.com/hashicorp/nomad/scheduler"
    14  )
    15  
    16  const (
    17  	// backoffBaselineFast is the baseline time for exponential backoff
    18  	backoffBaselineFast = 20 * time.Millisecond
    19  
    20  	// backoffBaselineSlow is the baseline time for exponential backoff,
    21  	// but much slower than backoffBaselineFast
    22  	backoffBaselineSlow = 500 * time.Millisecond
    23  
    24  	// backoffLimitFast is the limit of the exponential backoff
    25  	backoffLimitFast = time.Second
    26  
    27  	// backoffLimitSlow is the limit of the exponential backoff for
    28  	// the slower backoff
    29  	backoffLimitSlow = 10 * time.Second
    30  
    31  	// backoffSchedulerVersionMismatch is the backoff between retries when the
    32  	// scheduler version mismatches that of the leader.
    33  	backoffSchedulerVersionMismatch = 30 * time.Second
    34  
    35  	// dequeueTimeout is used to timeout an evaluation dequeue so that
    36  	// we can check if there is a shutdown event
    37  	dequeueTimeout = 500 * time.Millisecond
    38  
    39  	// raftSyncLimit is the limit of time we will wait for Raft replication
    40  	// to catch up to the evaluation. This is used to fast Nack and
    41  	// allow another scheduler to pick it up.
    42  	raftSyncLimit = 5 * time.Second
    43  
    44  	// dequeueErrGrace is the grace period where we don't log about
    45  	// dequeue errors after start. This is to improve the user experience
    46  	// in dev mode where the leader isn't elected for a few seconds.
    47  	dequeueErrGrace = 10 * time.Second
    48  )
    49  
    50  // Worker is a single-threaded scheduling worker. There may be multiple
    51  // workers running per server (leader or follower). They are responsible for
    52  // dequeuing pending evaluations, invoking schedulers, submitting plans, and
    53  // managing the lifecycle around task allocations. They bridge the business
    54  // logic of the scheduler with the plumbing required to make it all work.
    55  type Worker struct {
    56  	srv    *Server
    57  	logger *log.Logger
    58  	start  time.Time
    59  
    60  	paused    bool
    61  	pauseLock sync.Mutex
    62  	pauseCond *sync.Cond
    63  
    64  	failures uint
    65  
    66  	evalToken string
    67  
    68  	// snapshotIndex is the index of the snapshot in which the scheduler was
    69  	// first invoked. It is used to mark the SnapshotIndex of evaluations
    70  	// Created, Updated or Reblocked.
    71  	snapshotIndex uint64
    72  }
    73  
    74  // NewWorker starts a new worker associated with the given server
    75  func NewWorker(srv *Server) (*Worker, error) {
    76  	w := &Worker{
    77  		srv:    srv,
    78  		logger: srv.logger,
    79  		start:  time.Now(),
    80  	}
    81  	w.pauseCond = sync.NewCond(&w.pauseLock)
    82  	go w.run()
    83  	return w, nil
    84  }
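
// Illustrative sketch (not part of the original file): a server typically
// starts several of these workers, one per configured scheduler goroutine
// (the NumSchedulers server option in Nomad); the actual wiring lives in the
// server setup code, so this is only an approximation of that pattern.
func startWorkers(srv *Server, count int) ([]*Worker, error) {
	workers := make([]*Worker, 0, count)
	for i := 0; i < count; i++ {
		// Each NewWorker call spawns its own run() goroutine.
		w, err := NewWorker(srv)
		if err != nil {
			return nil, err
		}
		workers = append(workers, w)
	}
	return workers, nil
}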
    85  
    86  // SetPause is used to pause or unpause a worker
    87  func (w *Worker) SetPause(p bool) {
    88  	w.pauseLock.Lock()
    89  	w.paused = p
    90  	w.pauseLock.Unlock()
    91  	if !p {
    92  		w.pauseCond.Broadcast()
    93  	}
    94  }
    95  
    96  // checkPaused is used to park the worker when paused
    97  func (w *Worker) checkPaused() {
    98  	w.pauseLock.Lock()
    99  	for w.paused {
   100  		w.pauseCond.Wait()
   101  	}
   102  	w.pauseLock.Unlock()
   103  }
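
// Illustrative usage (not part of the original file): pausing and later
// resuming every worker on a server, e.g. around an operation that must not
// race with scheduling. The workers slice is hypothetical; the real server
// keeps track of its own workers.
func setWorkersPaused(workers []*Worker, paused bool) {
	for _, w := range workers {
		// Paused workers park in checkPaused until SetPause(false) broadcasts.
		w.SetPause(paused)
	}
}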
   104  
   105  // run is the long-lived goroutine which is used to run the worker
   106  func (w *Worker) run() {
   107  	for {
   108  		// Dequeue a pending evaluation
   109  		eval, token, waitIndex, shutdown := w.dequeueEvaluation(dequeueTimeout)
   110  		if shutdown {
   111  			return
   112  		}
   113  
   114  		// Check for a shutdown
   115  		if w.srv.IsShutdown() {
   116  			w.sendAck(eval.ID, token, false)
   117  			return
   118  		}
   119  
   120  		// Wait for the raft log to catch up to the evaluation
   121  		if err := w.waitForIndex(waitIndex, raftSyncLimit); err != nil {
   122  			w.sendAck(eval.ID, token, false)
   123  			continue
   124  		}
   125  
   126  		// Invoke the scheduler to determine placements
   127  		if err := w.invokeScheduler(eval, token); err != nil {
   128  			w.sendAck(eval.ID, token, false)
   129  			continue
   130  		}
   131  
   132  		// Complete the evaluation
   133  		w.sendAck(eval.ID, token, true)
   134  	}
   135  }
   136  
   137  // dequeueEvaluation is used to fetch the next ready evaluation.
   138  // This blocks until an evaluation is available or a timeout is reached.
   139  func (w *Worker) dequeueEvaluation(timeout time.Duration) (
   140  	eval *structs.Evaluation, token string, waitIndex uint64, shutdown bool) {
   141  	// Setup the request
   142  	req := structs.EvalDequeueRequest{
   143  		Schedulers:       w.srv.config.EnabledSchedulers,
   144  		Timeout:          timeout,
   145  		SchedulerVersion: scheduler.SchedulerVersion,
   146  		WriteRequest: structs.WriteRequest{
   147  			Region: w.srv.config.Region,
   148  		},
   149  	}
   150  	var resp structs.EvalDequeueResponse
   151  
   152  REQ:
   153  	// Check if we are paused
   154  	w.checkPaused()
   155  
   156  	// Make a blocking RPC
   157  	start := time.Now()
   158  	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
   159  	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
   160  	if err != nil {
   161  		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
   162  			w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err)
   163  		}
   164  
   165  		// Adjust the backoff based on the error. If it is a scheduler version
   166  		// mismatch, we increase both the baseline and the limit.
   167  		base, limit := backoffBaselineFast, backoffLimitSlow
   168  		if strings.Contains(err.Error(), "calling scheduler version") {
   169  			base = backoffSchedulerVersionMismatch
   170  			limit = backoffSchedulerVersionMismatch
   171  		}
   172  
   173  		if w.backoffErr(base, limit) {
   174  			return nil, "", 0, true
   175  		}
   176  		goto REQ
   177  	}
   178  	w.backoffReset()
   179  
   180  	// Check if we got a response
   181  	if resp.Eval != nil {
   182  		w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID)
   183  		return resp.Eval, resp.Token, resp.GetWaitIndex(), false
   184  	}
   185  
   186  	// Check for potential shutdown
   187  	if w.srv.IsShutdown() {
   188  		return nil, "", 0, true
   189  	}
   190  	goto REQ
   191  }
   192  
   193  // sendAck makes a best effort to ack or nack the evaluation.
   194  // Any errors are logged but swallowed.
   195  func (w *Worker) sendAck(evalID, token string, ack bool) {
   196  	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
   197  	// Setup the request
   198  	req := structs.EvalAckRequest{
   199  		EvalID: evalID,
   200  		Token:  token,
   201  		WriteRequest: structs.WriteRequest{
   202  			Region: w.srv.config.Region,
   203  		},
   204  	}
   205  	var resp structs.GenericResponse
   206  
   207  	// Determine if this is an Ack or Nack
   208  	verb := "ack"
   209  	endpoint := "Eval.Ack"
   210  	if !ack {
   211  		verb = "nack"
   212  		endpoint = "Eval.Nack"
   213  	}
   214  
   215  	// Make the RPC call
   216  	err := w.srv.RPC(endpoint, &req, &resp)
   217  	if err != nil {
   218  		w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v",
   219  			verb, evalID, err)
   220  	} else {
   221  		w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID)
   222  	}
   223  }
   224  
   225  // waitForIndex ensures that the local state is at least as fresh
   226  // as the given index. This is used before starting an evaluation,
   227  // but also potentially mid-stream. If a Plan fails because of stale
   228  // state (attempt to allocate to a failed/dead node), we may need
   229  // to sync our state again and do the planning with more recent data.
   230  func (w *Worker) waitForIndex(index uint64, timeout time.Duration) error {
   231  	// XXX: A potential optimization is to set up a watch on the state store's
   232  	// index table and only unblock via a trigger rather than timing out and
   233  	// checking.
   234  
   235  	start := time.Now()
   236  	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
   237  CHECK:
   238  	// Get the state store's current index
   239  	snapshotIndex, err := w.srv.fsm.State().LatestIndex()
   240  	if err != nil {
   241  		return fmt.Errorf("failed to determine state store's index: %v", err)
   242  	}
   243  
   244  	// We only need the FSM state to be as recent as the given index
   245  	if index <= snapshotIndex {
   246  		w.backoffReset()
   247  		return nil
   248  	}
   249  
   250  	// Check if we've reached our limit
   251  	if time.Since(start) > timeout {
   252  		return fmt.Errorf("sync wait timeout reached")
   253  	}
   254  
   255  	// Exponential back off if we haven't yet reached it
   256  	if w.backoffErr(backoffBaselineFast, backoffLimitFast) {
   257  		return fmt.Errorf("shutdown while waiting for state sync")
   258  	}
   259  	goto CHECK
   260  }
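
// Illustrative usage (not part of the original file): callers pass the Raft
// index their data must reflect and give up after raftSyncLimit, so that the
// evaluation can be nacked and picked up by a scheduler with fresher state.
func ensureStateCaughtUp(w *Worker, waitIndex uint64) bool {
	if err := w.waitForIndex(waitIndex, raftSyncLimit); err != nil {
		// The caller should nack or abort rather than plan on stale state.
		return false
	}
	return true
}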
   261  
   262  // invokeScheduler is used to invoke the business logic of the scheduler
   263  func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
   264  	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
   265  	// Store the evaluation token
   266  	w.evalToken = token
   267  
   268  	// Snapshot the current state
   269  	snap, err := w.srv.fsm.State().Snapshot()
   270  	if err != nil {
   271  		return fmt.Errorf("failed to snapshot state: %v", err)
   272  	}
   273  
   274  	// Store the snapshot's index
   275  	w.snapshotIndex, err = snap.LatestIndex()
   276  	if err != nil {
   277  		return fmt.Errorf("failed to determine snapshot's index: %v", err)
   278  	}
   279  
   280  	// Create the scheduler, or use the special core scheduler
   281  	var sched scheduler.Scheduler
   282  	if eval.Type == structs.JobTypeCore {
   283  		sched = NewCoreScheduler(w.srv, snap)
   284  	} else {
   285  		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
   286  		if err != nil {
   287  			return fmt.Errorf("failed to instantiate scheduler: %v", err)
   288  		}
   289  	}
   290  
   291  	// Process the evaluation
   292  	err = sched.Process(eval)
   293  	if err != nil {
   294  		return fmt.Errorf("failed to process evaluation: %v", err)
   295  	}
   296  	return nil
   297  }
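
// Illustrative note (not part of the original file): the worker itself is the
// planner handed to scheduler.NewScheduler above, which is why it provides
// SubmitPlan, UpdateEval, CreateEval and ReblockEval below. Assuming the
// scheduler.Planner interface covers exactly those methods, the relationship
// can be stated as a compile-time assertion:
var _ scheduler.Planner = (*Worker)(nil)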
   298  
   299  // SubmitPlan is used to submit a plan for consideration. This allows
   300  // the worker to act as the planner for the scheduler.
   301  func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
   302  	// Check for a shutdown before plan submission
   303  	if w.srv.IsShutdown() {
   304  		return nil, nil, fmt.Errorf("shutdown while planning")
   305  	}
   306  	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())
   307  
   308  	// Add the evaluation token to the plan
   309  	plan.EvalToken = w.evalToken
   310  
   311  	// Setup the request
   312  	req := structs.PlanRequest{
   313  		Plan: plan,
   314  		WriteRequest: structs.WriteRequest{
   315  			Region: w.srv.config.Region,
   316  		},
   317  	}
   318  	var resp structs.PlanResponse
   319  
   320  SUBMIT:
   321  	// Make the RPC call
   322  	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
   323  		w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
   324  			plan.EvalID, err)
   325  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   326  			goto SUBMIT
   327  		}
   328  		return nil, nil, err
   329  	} else {
   330  		w.logger.Printf("[DEBUG] worker: submitted plan at index %d for evaluation %s", resp.Index, plan.EvalID)
   331  		w.backoffReset()
   332  	}
   333  
   334  	// Look for a result
   335  	result := resp.Result
   336  	if result == nil {
   337  		return nil, nil, fmt.Errorf("missing result")
   338  	}
   339  
   340  	// Check if a state update is required. This can happen when the plan was
   341  	// based on stale data, for example because a node failed after we started
   342  	// planning or because of conflicting task
   343  	// allocations.
   344  	var state scheduler.State
   345  	if result.RefreshIndex != 0 {
   346  		// Wait for the raft log to catch up to the required refresh index
   347  		w.logger.Printf("[DEBUG] worker: refreshing state to index %d for %q", result.RefreshIndex, plan.EvalID)
   348  		if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
   349  			return nil, nil, err
   350  		}
   351  
   352  		// Snapshot the current state
   353  		snap, err := w.srv.fsm.State().Snapshot()
   354  		if err != nil {
   355  			return nil, nil, fmt.Errorf("failed to snapshot state: %v", err)
   356  		}
   357  		state = snap
   358  	}
   359  
   360  	// Return the result and potential state update
   361  	return result, state, nil
   362  }
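
// Illustrative sketch (not part of the original file) of how a scheduler-side
// caller might consume SubmitPlan: a non-nil state means the plan server asked
// for a refresh because the worker was planning on stale data, and the
// refreshed snapshot should replace the old one before re-planning. Any retry
// loop is omitted here.
func submitAndMaybeRefresh(w *Worker, plan *structs.Plan, cur scheduler.State) (*structs.PlanResult, scheduler.State, error) {
	result, newState, err := w.SubmitPlan(plan)
	if err != nil {
		return nil, cur, err
	}
	if newState != nil {
		// Continue with the refreshed snapshot rather than the stale one.
		return result, newState, nil
	}
	return result, cur, nil
}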
   363  
   364  // UpdateEval is used to submit an updated evaluation. This allows
   365  // the worker to act as the planner for the scheduler.
   366  func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
   367  	// Check for a shutdown before updating the evaluation
   368  	if w.srv.IsShutdown() {
   369  		return fmt.Errorf("shutdown while planning")
   370  	}
   371  	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())
   372  
   373  	// Store the snapshot index in the eval
   374  	eval.SnapshotIndex = w.snapshotIndex
   375  
   376  	// Setup the request
   377  	req := structs.EvalUpdateRequest{
   378  		Evals:     []*structs.Evaluation{eval},
   379  		EvalToken: w.evalToken,
   380  		WriteRequest: structs.WriteRequest{
   381  			Region: w.srv.config.Region,
   382  		},
   383  	}
   384  	var resp structs.GenericResponse
   385  
   386  SUBMIT:
   387  	// Make the RPC call
   388  	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
   389  		w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v",
   390  			eval, err)
   391  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   392  			goto SUBMIT
   393  		}
   394  		return err
   395  	} else {
   396  		w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval)
   397  		w.backoffReset()
   398  	}
   399  	return nil
   400  }
   401  
   402  // CreateEval is used to create a new evaluation. This allows
   403  // the worker to act as the planner for the scheduler.
   404  func (w *Worker) CreateEval(eval *structs.Evaluation) error {
   405  	// Check for a shutdown before creating the evaluation
   406  	if w.srv.IsShutdown() {
   407  		return fmt.Errorf("shutdown while planning")
   408  	}
   409  	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())
   410  
   411  	// Store the snapshot index in the eval
   412  	eval.SnapshotIndex = w.snapshotIndex
   413  
   414  	// Setup the request
   415  	req := structs.EvalUpdateRequest{
   416  		Evals:     []*structs.Evaluation{eval},
   417  		EvalToken: w.evalToken,
   418  		WriteRequest: structs.WriteRequest{
   419  			Region: w.srv.config.Region,
   420  		},
   421  	}
   422  	var resp structs.GenericResponse
   423  
   424  SUBMIT:
   425  	// Make the RPC call
   426  	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
   427  		w.logger.Printf("[ERR] worker: failed to create evaluation %#v: %v",
   428  			eval, err)
   429  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   430  			goto SUBMIT
   431  		}
   432  		return err
   433  	} else {
   434  		w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval)
   435  		w.backoffReset()
   436  	}
   437  	return nil
   438  }
   439  
   440  // ReblockEval is used to reinsert a blocked evaluation into the blocked eval
   441  // tracker. This allows the worker to act as the planner for the scheduler.
   442  func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
   443  	// Check for a shutdown before reblocking the evaluation
   444  	if w.srv.IsShutdown() {
   445  		return fmt.Errorf("shutdown while planning")
   446  	}
   447  	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())
   448  
   449  	// Update the evaluation if the queued allocations are not the same as
   450  	// what is recorded in the job summary
   451  	ws := memdb.NewWatchSet()
   452  	summary, err := w.srv.fsm.state.JobSummaryByID(ws, eval.Namespace, eval.JobID)
   453  	if err != nil {
   454  		return fmt.Errorf("couldn't retrieve job summary: %v", err)
   455  	}
   456  	if summary != nil {
   457  		var hasChanged bool
   458  		for tg, tgSummary := range summary.Summary {
   459  			if queued, ok := eval.QueuedAllocations[tg]; ok {
   460  				if queued != tgSummary.Queued {
   461  					hasChanged = true
   462  					break
   463  				}
   464  			}
   465  		}
   466  		if hasChanged {
   467  			if err := w.UpdateEval(eval); err != nil {
   468  				return err
   469  			}
   470  		}
   471  	}
   472  
   473  	// Store the snapshot index in the eval
   474  	eval.SnapshotIndex = w.snapshotIndex
   475  
   476  	// Setup the request
   477  	req := structs.EvalUpdateRequest{
   478  		Evals:     []*structs.Evaluation{eval},
   479  		EvalToken: w.evalToken,
   480  		WriteRequest: structs.WriteRequest{
   481  			Region: w.srv.config.Region,
   482  		},
   483  	}
   484  	var resp structs.GenericResponse
   485  
   486  SUBMIT:
   487  	// Make the RPC call
   488  	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
   489  		w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v",
   490  			eval, err)
   491  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   492  			goto SUBMIT
   493  		}
   494  		return err
   495  	} else {
   496  		w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval)
   497  		w.backoffReset()
   498  	}
   499  	return nil
   500  }
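
// Illustrative helper (not part of the original file) restating the check at
// the top of ReblockEval: the evaluation is updated whenever its per-task-group
// queued counts disagree with the job summary, e.g. QueuedAllocations["web"]
// recording 5 queued allocations while the summary records 3.
func queuedCountsDiffer(eval *structs.Evaluation, summary *structs.JobSummary) bool {
	for tg, tgSummary := range summary.Summary {
		if queued, ok := eval.QueuedAllocations[tg]; ok && queued != tgSummary.Queued {
			return true
		}
	}
	return false
}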
   501  
   502  // shouldResubmit checks if a given error should be swallowed and the plan
   503  // resubmitted after a backoff. Usually these are transient errors that
   504  // the cluster should heal from quickly.
   505  func (w *Worker) shouldResubmit(err error) bool {
   506  	s := err.Error()
   507  	switch {
   508  	case strings.Contains(s, "No cluster leader"):
   509  		return true
   510  	case strings.Contains(s, "plan queue is disabled"):
   511  		return true
   512  	default:
   513  		return false
   514  	}
   515  }
   516  
   517  // backoffErr is used to do an exponential backoff on error. The failure count
   518  // is maintained statefully for the worker. It returns true if attempts should
   519  // be abandoned due to shutdown.
   520  func (w *Worker) backoffErr(base, limit time.Duration) bool {
   521  	backoff := (1 << (2 * w.failures)) * base
   522  	if backoff > limit {
   523  		backoff = limit
   524  	} else {
   525  		w.failures++
   526  	}
   527  	select {
   528  	case <-time.After(backoff):
   529  		return false
   530  	case <-w.srv.shutdownCh:
   531  		return true
   532  	}
   533  }
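
// Illustrative helper (not part of the original file) showing the delay that
// backoffErr computes for a given failure count, before the shutdown check.
// With backoffBaselineFast (20ms) and backoffLimitFast (1s) the progression is
// 20ms, 80ms, 320ms, then capped at 1s; with backoffBaselineSlow (500ms) and
// backoffLimitSlow (10s) it is 500ms, 2s, 8s, then capped at 10s.
func backoffDuration(failures uint, base, limit time.Duration) time.Duration {
	backoff := (1 << (2 * failures)) * base
	if backoff > limit {
		backoff = limit
	}
	return backoff
}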
   534  
   535  // backoffReset is used to reset the failure count for
   536  // exponential backoff
   537  func (w *Worker) backoffReset() {
   538  	w.failures = 0
   539  }