github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/nomad/worker.go

package nomad

import (
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff,
	// but much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitFast is the limit of the exponential backoff
	backoffLimitFast = time.Second

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// backoffSchedulerVersionMismatch is the backoff between retries when the
	// scheduler version mismatches that of the leader.
	backoffSchedulerVersionMismatch = 30 * time.Second

	// dequeueTimeout is used to time out an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to quickly Nack and
	// allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period where we don't log about
	// dequeue errors after start. This is to improve the user experience
	// in dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)

// Worker is a single-threaded scheduling worker. There may be multiple
// workers running per server (leader or follower). They are responsible for
// dequeuing pending evaluations, invoking schedulers, submitting plans, and
// managing the lifecycle of task allocations. They bridge the business logic
// of the scheduler with the plumbing required to make it all work.
type Worker struct {
	srv    *Server
	logger *log.Logger
	start  time.Time

	paused    bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond

	failures uint

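	// evalToken is the token for the evaluation currently being processed.
	// It is attached to plan submissions and eval updates made while
	// processing that evaluation.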
	evalToken string

	// snapshotIndex is the index of the snapshot in which the scheduler was
	// first invoked. It is used to mark the SnapshotIndex of evaluations
	// Created, Updated or Reblocked.
	snapshotIndex uint64
}

// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
	w := &Worker{
		srv:    srv,
		logger: srv.logger,
		start:  time.Now(),
	}
	w.pauseCond = sync.NewCond(&w.pauseLock)
	go w.run()
	return w, nil
}

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
	w.pauseLock.Lock()
	w.paused = p
	w.pauseLock.Unlock()
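	// Unpausing wakes the worker if it is parked in checkPaused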
	if !p {
		w.pauseCond.Broadcast()
	}
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
	w.pauseLock.Lock()
	for w.paused {
		w.pauseCond.Wait()
	}
	w.pauseLock.Unlock()
}

// run is the long-lived goroutine which is used to run the worker
func (w *Worker) run() {
	for {
		// Dequeue a pending evaluation
		eval, token, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Check for a shutdown
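		// If so, Nack the eval so it can be redelivered to another worker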
		if w.srv.IsShutdown() {
			w.sendAck(eval.ID, token, false)
			return
		}

		// Wait for the raft log to catch up to the evaluation
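		// If replication does not catch up within raftSyncLimit, Nack so
		// another scheduler can pick the evaluation up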
		if err := w.waitForIndex(eval.ModifyIndex, raftSyncLimit); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Invoke the scheduler to determine placements
		if err := w.invokeScheduler(eval, token); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval.ID, token, true)
	}
}

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (*structs.Evaluation, string, bool) {
	// Setup the request
	req := structs.EvalDequeueRequest{
		Schedulers:       w.srv.config.EnabledSchedulers,
		Timeout:          timeout,
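		// The scheduler version lets the leader reject dequeues from workers
		// running an incompatible scheduler (see backoffSchedulerVersionMismatch)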
		SchedulerVersion: scheduler.SchedulerVersion,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Check if we are paused
	w.checkPaused()

	// Make a blocking RPC
	start := time.Now()
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
			w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err)
		}

		// Adjust the backoff based on the error. If it is a scheduler version
		// mismatch we increase the baseline.
		base, limit := backoffBaselineFast, backoffLimitSlow
		if strings.Contains(err.Error(), "calling scheduler version") {
			base = backoffSchedulerVersionMismatch
			limit = backoffSchedulerVersionMismatch
		}

		if w.backoffErr(base, limit) {
			return nil, "", true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response
	if resp.Eval != nil {
		w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID)
		return resp.Eval, resp.Token, false
	}

	// Check for potential shutdown
	if w.srv.IsShutdown() {
		return nil, "", true
	}
	goto REQ
}

// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Setup the request
	req := structs.EvalAckRequest{
		EvalID: evalID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v",
			verb, evalID, err)
	} else {
		w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID)
	}
}

// waitForIndex ensures that the local state is at least as fresh
// as the given index. This is used before starting an evaluation,
// but also potentially mid-stream. If a Plan fails because of stale
// state (attempt to allocate to a failed/dead node), we may need
// to sync our state again and do the planning with more recent data.
func (w *Worker) waitForIndex(index uint64, timeout time.Duration) error {
	// XXX: Potential optimization is to set up a watch on the state store's
	// index table and only unblock via a trigger rather than timing out and
	// checking.

	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
CHECK:
	// Get the state store's current index
	snapshotIndex, err := w.srv.fsm.State().LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine state store's index: %v", err)
	}

	// We only need the FSM state to be as recent as the given index
	if index <= snapshotIndex {
		w.backoffReset()
		return nil
	}

	// Check if we've reached our limit
	if time.Since(start) > timeout {
		return fmt.Errorf("sync wait timeout reached")
	}

	// Exponentially back off and check again
	if w.backoffErr(backoffBaselineFast, backoffLimitFast) {
		return fmt.Errorf("shutdown while waiting for state sync")
	}
	goto CHECK
}

// invokeScheduler is used to invoke the business logic of the scheduler
func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token
	w.evalToken = token

	// Snapshot the current state
	snap, err := w.srv.fsm.State().Snapshot()
	if err != nil {
		return fmt.Errorf("failed to snapshot state: %v", err)
	}

	// Store the snapshot's index
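	// Evaluations created, updated, or reblocked during this invocation
	// record it as their SnapshotIndex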
	w.snapshotIndex, err = snap.LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine snapshot's index: %v", err)
	}

	// Create the scheduler, or use the special core scheduler for core jobs
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}

// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
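	// The token was handed out when the evaluation was dequeued and ties the
	// plan back to this worker's claim on the eval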
	plan.EvalToken = w.evalToken

	// Setup the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
			plan.EvalID, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This can happen if we were
	// planning based on stale data, which caused issues; for example, a
	// node may have failed since we started planning, or there may be
	// conflicting task allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catch up to the refresh index
		w.logger.Printf("[DEBUG] worker: refreshing state to index %d for %q", result.RefreshIndex, plan.EvalID)
		if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
			return nil, nil, err
		}

		// Snapshot the current state
		snap, err := w.srv.fsm.State().Snapshot()
		if err != nil {
			return nil, nil, fmt.Errorf("failed to snapshot state: %v", err)
		}
		state = snap
	}

	// Return the result and potential state update
	return result, state, nil
}

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before submitting the update
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) CreateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before creating the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to create evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// ReblockEval is used to reinsert a blocked evaluation into the blocked eval
// tracker. This allows the worker to act as the planner for the scheduler.
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
	// Check for a shutdown before reblocking the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())

	// Update the evaluation if the queued allocations are not the same as
	// what is recorded in the job summary
	ws := memdb.NewWatchSet()
	summary, err := w.srv.fsm.state.JobSummaryByID(ws, eval.JobID)
	if err != nil {
		return fmt.Errorf("couldn't retrieve job summary: %v", err)
	}
	if summary != nil {
		var hasChanged bool
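		// Compare the eval's queued allocation count for each task group
		// against the count recorded in the job summary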
		for tg, tgSummary := range summary.Summary {
			if queued, ok := eval.QueuedAllocations[tg]; ok {
				if queued != tgSummary.Queued {
					hasChanged = true
					break
				}
			}
		}
		if hasChanged {
			if err := w.UpdateEval(eval); err != nil {
				return err
			}
		}
	}

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
func (w *Worker) shouldResubmit(err error) bool {
	s := err.Error()
	switch {
	case strings.Contains(s, "No cluster leader"):
		return true
	case strings.Contains(s, "plan queue is disabled"):
		return true
	default:
		return false
	}
}

// backoffErr is used to do an exponential back off on error. This is
// maintained statefully for the worker. It returns true if attempts should
// be abandoned due to shutdown.
func (w *Worker) backoffErr(base, limit time.Duration) bool {
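	// Each failure quadruples the backoff until the limit is reached; the
	// failure count stops growing once the computed backoff exceeds the limit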
	backoff := (1 << (2 * w.failures)) * base
	if backoff > limit {
		backoff = limit
	} else {
		w.failures++
	}
	select {
	case <-time.After(backoff):
		return false
	case <-w.srv.shutdownCh:
		return true
	}
}

// backoffReset is used to reset the failure count for
// exponential backoff
func (w *Worker) backoffReset() {
	w.failures = 0
}