package nomad

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff
	// but that is much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// backoffSchedulerVersionMismatch is the backoff between retries when the
	// scheduler version mismatches that of the leader.
	backoffSchedulerVersionMismatch = 30 * time.Second

	// dequeueTimeout is used to timeout an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to fast Nack and
	// allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period where we don't log about
	// dequeue errors after start. This is to improve the user experience
	// in dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)

// Worker is a single threaded scheduling worker. There may be multiple
// running per server (leader or follower). They are responsible for dequeuing
// pending evaluations, invoking schedulers, plan submission and the
// lifecycle around making task allocations. They bridge the business logic
// of the scheduler with the plumbing required to make it all work.
type Worker struct {
	srv    *Server
	logger log.Logger

	// start is when the worker was created; used to suppress dequeue
	// error logging during the dequeueErrGrace window.
	start time.Time

	// paused, pauseLock and pauseCond implement cooperative pausing:
	// SetPause flips the flag under pauseLock, and checkPaused parks the
	// run loop on the condition variable until the flag clears.
	paused    bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond

	// failures counts consecutive errors and drives the exponential
	// backoff in backoffErr; reset by backoffReset on success.
	failures uint

	// evalToken is the broker token for the evaluation currently being
	// processed; it is attached to plans and eval updates so the leader
	// can reject submissions from workers holding a stale eval.
	evalToken string

	// snapshotIndex is the index of the snapshot in which the scheduler was
	// first invoked. It is used to mark the SnapshotIndex of evaluations
	// Created, Updated or Reblocked.
	snapshotIndex uint64
}

// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
	w := &Worker{
		srv:    srv,
		logger: srv.logger.ResetNamed("worker"),
		start:  time.Now(),
	}
	w.pauseCond = sync.NewCond(&w.pauseLock)
	// The run loop lives for the life of the server; it exits only when a
	// shutdown is observed via the dequeue or backoff paths.
	go w.run()
	return w, nil
}

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
	w.pauseLock.Lock()
	w.paused = p
	w.pauseLock.Unlock()
	// Wake any worker parked in checkPaused once unpaused.
	if !p {
		w.pauseCond.Broadcast()
	}
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
	w.pauseLock.Lock()
	// Loop around Wait to guard against spurious wakeups; only proceed
	// once SetPause(false) has cleared the flag and broadcast.
	for w.paused {
		w.pauseCond.Wait()
	}
	w.pauseLock.Unlock()
}

// run is the long-lived goroutine which is used to run the worker.
// Each iteration dequeues one evaluation, waits for the local Raft state
// to catch up, invokes the scheduler, and then acks (on success) or
// nacks (on any failure) the evaluation with the broker.
func (w *Worker) run() {
	for {
		// Dequeue a pending evaluation
		eval, token, waitIndex, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Check for a shutdown; nack so another worker can pick the eval up.
		if w.srv.IsShutdown() {
			w.logger.Error("nacking eval because the server is shutting down", "eval", log.Fmt("%#v", eval))
			w.sendAck(eval.ID, token, false)
			return
		}

		// Wait for the raft log to catchup to the evaluation
		snap, err := w.snapshotAfter(waitIndex, raftSyncLimit)
		if err != nil {
			w.logger.Error("error waiting for Raft index", "error", err, "index", waitIndex)
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Invoke the scheduler to determine placements
		if err := w.invokeScheduler(snap, eval, token); err != nil {
			w.logger.Error("error invoking scheduler", "error", err)
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval.ID, token, true)
	}
}

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
// It retries indefinitely (with backoff on errors) and returns
// shutdown=true only when the server is shutting down.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (
	eval *structs.Evaluation, token string, waitIndex uint64, shutdown bool) {
	// Setup the request
	req := structs.EvalDequeueRequest{
		Schedulers:       w.srv.config.EnabledSchedulers,
		Timeout:          timeout,
		SchedulerVersion: scheduler.SchedulerVersion,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Check if we are paused
	w.checkPaused()

	// Make a blocking RPC
	start := time.Now()
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		// Stay quiet during the startup grace period so dev-mode users
		// aren't spammed while leader election is still in progress.
		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
			w.logger.Error("failed to dequeue evaluation", "error", err)
		}

		// Adjust the backoff based on the error. If it is a scheduler version
		// mismatch we increase the baseline.
		base, limit := backoffBaselineFast, backoffLimitSlow
		if strings.Contains(err.Error(), "calling scheduler version") {
			base = backoffSchedulerVersionMismatch
			limit = backoffSchedulerVersionMismatch
		}

		// backoffErr returns true only when the server shut down mid-wait.
		if w.backoffErr(base, limit) {
			return nil, "", 0, true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response; a nil Eval means the dequeue timed out
	// with nothing ready, so we loop and ask again.
	if resp.Eval != nil {
		w.logger.Debug("dequeued evaluation", "eval_id", resp.Eval.ID)
		return resp.Eval, resp.Token, resp.GetWaitIndex(), false
	}

	// Check for potential shutdown
	if w.srv.IsShutdown() {
		return nil, "", 0, true
	}
	goto REQ
}

// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Setup the request
	req := structs.EvalAckRequest{
		EvalID: evalID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Error(fmt.Sprintf("failed to %s evaluation", verb), "eval_id", evalID, "error", err)
	} else {
		w.logger.Debug(fmt.Sprintf("%s evaluation", verb), "eval_id", evalID)
	}
}

// snapshotAfter times calls to StateStore.SnapshotAfter which may block.
func (w *Worker) snapshotAfter(waitIndex uint64, timeout time.Duration) (*state.StateSnapshot, error) {
	start := time.Now()
	// Bound the wait by both the caller's timeout and server shutdown,
	// since shutdownCtx is the parent of the timeout context.
	ctx, cancel := context.WithTimeout(w.srv.shutdownCtx, timeout)
	snap, err := w.srv.fsm.State().SnapshotAfter(ctx, waitIndex)
	cancel()
	metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)

	// Wrap error to ensure callers don't disregard timeouts.
	if err == context.DeadlineExceeded {
		err = fmt.Errorf("timed out after %s waiting for index=%d", timeout, waitIndex)
	}

	return snap, err
}

// invokeScheduler is used to invoke the business logic of the scheduler
// against the given state snapshot for a single evaluation.
func (w *Worker) invokeScheduler(snap *state.StateSnapshot, eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token so subsequent SubmitPlan/UpdateEval/
	// CreateEval/ReblockEval calls can prove ownership of this eval.
	w.evalToken = token

	// Store the snapshot's index so evals written later carry the index
	// the scheduler's view of state was based on.
	var err error
	w.snapshotIndex, err = snap.LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine snapshot's index: %v", err)
	}

	// Create the scheduler, or use the special system scheduler
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		// w is passed as the planner so the scheduler calls back into
		// this worker's SubmitPlan/UpdateEval/CreateEval/ReblockEval.
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}

// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler. On success it
// returns the plan result and, if the leader indicated our state was
// stale, a refreshed state snapshot for the scheduler to retry with.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
	plan.EvalToken = w.evalToken

	// Normalize stopped and preempted allocs before RPC, but only once
	// every server in the cluster understands normalized plans.
	normalizePlan := ServersMeetMinimumVersion(w.srv.Members(), MinVersionPlanNormalization, true)
	if normalizePlan {
		plan.NormalizeAllocations()
	}

	// Setup the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call; retry only transient errors, and give up if the
	// backoff wait observed a shutdown.
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Error("failed to submit plan for evaluation", "eval_id", plan.EvalID, "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Debug("submitted plan for evaluation", "eval_id", plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This could be required if we
	// planning based on stale data, which is causing issues. For example, a
	// node failure since the time we've started planning or conflicting task
	// allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catchup to the evaluation
		w.logger.Debug("refreshing state", "refresh_index", result.RefreshIndex, "eval_id", plan.EvalID)

		var err error
		state, err = w.snapshotAfter(result.RefreshIndex, raftSyncLimit)
		if err != nil {
			return nil, nil, err
		}
	}

	// Return the result and potential state update
	return result, state, nil
}

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call; retry only transient errors, and give up if the
	// backoff wait observed a shutdown.
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Error("failed to update evaluation", "eval", log.Fmt("%#v", eval), "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Debug("updated evaluation", "eval", log.Fmt("%#v", eval))
		w.backoffReset()
	}
	return nil
}

// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
380 func (w *Worker) CreateEval(eval *structs.Evaluation) error { 381 // Check for a shutdown before plan submission 382 if w.srv.IsShutdown() { 383 return fmt.Errorf("shutdown while planning") 384 } 385 defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now()) 386 387 // Store the snapshot index in the eval 388 eval.SnapshotIndex = w.snapshotIndex 389 390 // Setup the request 391 req := structs.EvalUpdateRequest{ 392 Evals: []*structs.Evaluation{eval}, 393 EvalToken: w.evalToken, 394 WriteRequest: structs.WriteRequest{ 395 Region: w.srv.config.Region, 396 }, 397 } 398 var resp structs.GenericResponse 399 400 SUBMIT: 401 // Make the RPC call 402 if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil { 403 w.logger.Error("failed to create evaluation", "eval", log.Fmt("%#v", eval), "error", err) 404 if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) { 405 goto SUBMIT 406 } 407 return err 408 } else { 409 w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval)) 410 w.backoffReset() 411 } 412 return nil 413 } 414 415 // ReblockEval is used to reinsert a blocked evaluation into the blocked eval 416 // tracker. This allows the worker to act as the planner for the scheduler. 
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())

	// Update the evaluation if the queued jobs is not same as what is
	// recorded in the job summary
	ws := memdb.NewWatchSet()
	summary, err := w.srv.fsm.state.JobSummaryByID(ws, eval.Namespace, eval.JobID)
	if err != nil {
		return fmt.Errorf("couldn't retrieve job summary: %v", err)
	}
	if summary != nil {
		var hasChanged bool
		// NOTE: the range variable shadows the outer summary; inside the
		// loop it is the per-task-group summary. Only task groups present
		// in eval.QueuedAllocations are compared.
		for tg, summary := range summary.Summary {
			if queued, ok := eval.QueuedAllocations[tg]; ok {
				if queued != summary.Queued {
					hasChanged = true
					break
				}
			}
		}
		// Persist the eval first so its queued-allocation counts match
		// the job summary before it is reblocked.
		if hasChanged {
			if err := w.UpdateEval(eval); err != nil {
				return err
			}
		}
	}

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call; retry only transient errors, and give up if the
	// backoff wait observed a shutdown.
	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
		w.logger.Error("failed to reblock evaluation", "eval", log.Fmt("%#v", eval), "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Debug("reblocked evaluation", "eval", log.Fmt("%#v", eval))
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
479 func (w *Worker) shouldResubmit(err error) bool { 480 s := err.Error() 481 switch { 482 case strings.Contains(s, "No cluster leader"): 483 return true 484 case strings.Contains(s, "plan queue is disabled"): 485 return true 486 default: 487 return false 488 } 489 } 490 491 // backoffErr is used to do an exponential back off on error. This is 492 // maintained statefully for the worker. Returns if attempts should be 493 // abandoned due to shutdown. 494 func (w *Worker) backoffErr(base, limit time.Duration) bool { 495 backoff := (1 << (2 * w.failures)) * base 496 if backoff > limit { 497 backoff = limit 498 } else { 499 w.failures++ 500 } 501 select { 502 case <-time.After(backoff): 503 return false 504 case <-w.srv.shutdownCh: 505 return true 506 } 507 } 508 509 // backoffReset is used to reset the failure count for 510 // exponential backoff 511 func (w *Worker) backoffReset() { 512 w.failures = 0 513 }