github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/nomad/worker.go

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/armon/go-metrics"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  	"github.com/hashicorp/nomad/scheduler"
    13  )
    14  
    15  const (
    16  	// backoffBaselineFast is the baseline time for exponential backoff
    17  	backoffBaselineFast = 20 * time.Millisecond
    18  
    19  	// backoffBaselineSlow is the baseline time for exponential backoff,
    20  	// but is much slower than backoffBaselineFast
    21  	backoffBaselineSlow = 500 * time.Millisecond
    22  
    23  	// backoffLimitFast is the limit of the exponential backoff
    24  	backoffLimitFast = time.Second
    25  
    26  	// backoffLimitSlow is the limit of the exponential backoff for
    27  	// the slower backoff
    28  	backoffLimitSlow = 10 * time.Second
    29  
    30  	// dequeueTimeout is used to timeout an evaluation dequeue so that
    31  	// we can check if there is a shutdown event
    32  	dequeueTimeout = 500 * time.Millisecond
    33  
    34  	// raftSyncLimit is the limit of time we will wait for Raft replication
    35  	// to catch up to the evaluation. This is used to quickly Nack the
    36  	// evaluation and allow another scheduler to pick it up.
    37  	raftSyncLimit = 5 * time.Second
    38  
    39  	// dequeueErrGrace is the grace period where we don't log about
    40  	// dequeue errors after start. This is to improve the user experience
    41  	// in dev mode where the leader isn't elected for a few seconds.
    42  	dequeueErrGrace = 10 * time.Second
    43  )
    44  
    45  // Worker is a single threaded scheduling worker. There may be multiple
    46  // running per server (leader or follower). They are responsible for dequeuing
    47  // pending evaluations, invoking schedulers, submitting plans, and managing
    48  // the lifecycle around task allocations. They bridge the business logic
    49  // of the scheduler with the plumbing required to make it all work.
    50  type Worker struct {
    51  	srv    *Server
    52  	logger *log.Logger
    53  	start  time.Time
    54  
    55  	paused    bool
    56  	pauseLock sync.Mutex
    57  	pauseCond *sync.Cond
    58  
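        	// failures tracks consecutive errors for exponential backoff; it is
        	// reset by backoffReset once an operation succeeds.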
    59  	failures uint
    60  
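        	// evalToken is the token of the evaluation currently being processed.
        	// It is attached to plans and evaluation updates submitted by this worker.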
    61  	evalToken string
    62  
    63  	// snapshotIndex is the index of the snapshot in which the scheduler was
    64  	// first invoked. It is used to mark the SnapshotIndex of evaluations
    65  	// Created, Updated or Reblocked.
    66  	snapshotIndex uint64
    67  }
    68  
    69  // NewWorker starts a new worker associated with the given server
    70  func NewWorker(srv *Server) (*Worker, error) {
    71  	w := &Worker{
    72  		srv:    srv,
    73  		logger: srv.logger,
    74  		start:  time.Now(),
    75  	}
    76  	w.pauseCond = sync.NewCond(&w.pauseLock)
    77  	go w.run()
    78  	return w, nil
    79  }
    80  
    81  // SetPause is used to pause or unpause a worker
    82  func (w *Worker) SetPause(p bool) {
    83  	w.pauseLock.Lock()
    84  	w.paused = p
    85  	w.pauseLock.Unlock()
    86  	if !p {
    87  		w.pauseCond.Broadcast()
    88  	}
    89  }
    90  
    91  // checkPaused is used to park the worker when paused
    92  func (w *Worker) checkPaused() {
    93  	w.pauseLock.Lock()
    94  	for w.paused {
    95  		w.pauseCond.Wait()
    96  	}
    97  	w.pauseLock.Unlock()
    98  }
    99  
   100  // run is the long-lived goroutine which is used to run the worker
   101  func (w *Worker) run() {
   102  	for {
   103  		// Dequeue a pending evaluation
   104  		eval, token, shutdown := w.dequeueEvaluation(dequeueTimeout)
   105  		if shutdown {
   106  			return
   107  		}
   108  
   109  		// Check for a shutdown
   110  		if w.srv.IsShutdown() {
   111  			w.sendAck(eval.ID, token, false)
   112  			return
   113  		}
   114  
   115  	// Wait for the raft log to catch up to the evaluation
   116  		if err := w.waitForIndex(eval.ModifyIndex, raftSyncLimit); err != nil {
   117  			w.sendAck(eval.ID, token, false)
   118  			continue
   119  		}
   120  
   121  		// Invoke the scheduler to determine placements
   122  		if err := w.invokeScheduler(eval, token); err != nil {
   123  			w.sendAck(eval.ID, token, false)
   124  			continue
   125  		}
   126  
   127  		// Complete the evaluation
   128  		w.sendAck(eval.ID, token, true)
   129  	}
   130  }
   131  
   132  // dequeueEvaluation is used to fetch the next ready evaluation.
   133  // This blocks until an evaluation is available or a timeout is reached.
   134  func (w *Worker) dequeueEvaluation(timeout time.Duration) (*structs.Evaluation, string, bool) {
   135  	// Setup the request
   136  	req := structs.EvalDequeueRequest{
   137  		Schedulers: w.srv.config.EnabledSchedulers,
   138  		Timeout:    timeout,
   139  		WriteRequest: structs.WriteRequest{
   140  			Region: w.srv.config.Region,
   141  		},
   142  	}
   143  	var resp structs.EvalDequeueResponse
   144  
   145  REQ:
   146  	// Check if we are paused
   147  	w.checkPaused()
   148  
   149  	// Make a blocking RPC
   150  	start := time.Now()
   151  	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
   152  	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
   153  	if err != nil {
   154  		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
   155  			w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err)
   156  		}
   157  		if w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   158  			return nil, "", true
   159  		}
   160  		goto REQ
   161  	}
   162  	w.backoffReset()
   163  
   164  	// Check if we got a response
   165  	if resp.Eval != nil {
   166  		w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID)
   167  		return resp.Eval, resp.Token, false
   168  	}
   169  
   170  	// Check for potential shutdown
   171  	if w.srv.IsShutdown() {
   172  		return nil, "", true
   173  	}
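        	// No evaluation was ready before the dequeue timeout; retry the request.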
   174  	goto REQ
   175  }
   176  
   177  // sendAck makes a best effort to ack or nack the evaluation.
   178  // Any errors are logged but swallowed.
   179  func (w *Worker) sendAck(evalID, token string, ack bool) {
   180  	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
   181  	// Setup the request
   182  	req := structs.EvalAckRequest{
   183  		EvalID: evalID,
   184  		Token:  token,
   185  		WriteRequest: structs.WriteRequest{
   186  			Region: w.srv.config.Region,
   187  		},
   188  	}
   189  	var resp structs.GenericResponse
   190  
   191  	// Determine if this is an Ack or Nack
   192  	verb := "ack"
   193  	endpoint := "Eval.Ack"
   194  	if !ack {
   195  		verb = "nack"
   196  		endpoint = "Eval.Nack"
   197  	}
   198  
   199  	// Make the RPC call
   200  	err := w.srv.RPC(endpoint, &req, &resp)
   201  	if err != nil {
   202  		w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v",
   203  			verb, evalID, err)
   204  	} else {
   205  		w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID)
   206  	}
   207  }
   208  
   209  // waitForIndex ensures that the local state is at least as fresh
   210  // as the given index. This is used before starting an evaluation,
   211  // but also potentially mid-stream. If a Plan fails because of stale
   212  // state (attempt to allocate to a failed/dead node), we may need
   213  // to sync our state again and do the planning with more recent data.
   214  func (w *Worker) waitForIndex(index uint64, timeout time.Duration) error {
   215  	start := time.Now()
   216  	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
   217  CHECK:
   218  	// We only need the FSM state to be as recent as the given index
   219  	appliedIndex := w.srv.raft.AppliedIndex()
   220  	if index <= appliedIndex {
   221  		w.backoffReset()
   222  		return nil
   223  	}
   224  
   225  	// Check if we've reached our limit
   226  	if time.Since(start) > timeout {
   227  		return fmt.Errorf("sync wait timeout reached")
   228  	}
   229  
   230  	// Exponential backoff if we haven't yet reached the index
   231  	if w.backoffErr(backoffBaselineFast, backoffLimitFast) {
   232  		return fmt.Errorf("shutdown while waiting for state sync")
   233  	}
   234  	goto CHECK
   235  }
   236  
   237  // invokeScheduler is used to invoke the business logic of the scheduler
   238  func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
   239  	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
   240  	// Store the evaluation token
   241  	w.evalToken = token
   242  
   243  	// Snapshot the current state
   244  	snap, err := w.srv.fsm.State().Snapshot()
   245  	if err != nil {
   246  		return fmt.Errorf("failed to snapshot state: %v", err)
   247  	}
   248  
   249  	// Store the snapshot's index
   250  	w.snapshotIndex, err = snap.LatestIndex()
   251  	if err != nil {
   252  		return fmt.Errorf("failed to determine snapshot's index: %v", err)
   253  	}
   254  
   255  	// Create the scheduler, or use the special system scheduler
   256  	var sched scheduler.Scheduler
   257  	if eval.Type == structs.JobTypeCore {
   258  		sched = NewCoreScheduler(w.srv, snap)
   259  	} else {
   260  		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
   261  		if err != nil {
   262  			return fmt.Errorf("failed to instantiate scheduler: %v", err)
   263  		}
   264  	}
   265  
   266  	// Process the evaluation
   267  	err = sched.Process(eval)
   268  	if err != nil {
   269  		return fmt.Errorf("failed to process evaluation: %v", err)
   270  	}
   271  	return nil
   272  }
   273  
   274  // SubmitPlan is used to submit a plan for consideration. This allows
   275  // the worker to act as the planner for the scheduler.
   276  func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
   277  	// Check for a shutdown before plan submission
   278  	if w.srv.IsShutdown() {
   279  		return nil, nil, fmt.Errorf("shutdown while planning")
   280  	}
   281  	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())
   282  
   283  	// Add the evaluation token to the plan
   284  	plan.EvalToken = w.evalToken
   285  
   286  	// Setup the request
   287  	req := structs.PlanRequest{
   288  		Plan: plan,
   289  		WriteRequest: structs.WriteRequest{
   290  			Region: w.srv.config.Region,
   291  		},
   292  	}
   293  	var resp structs.PlanResponse
   294  
   295  SUBMIT:
   296  	// Make the RPC call
   297  	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
   298  		w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
   299  			plan.EvalID, err)
   300  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   301  			goto SUBMIT
   302  		}
   303  		return nil, nil, err
   304  	} else {
   305  		w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID)
   306  		w.backoffReset()
   307  	}
   308  
   309  	// Look for a result
   310  	result := resp.Result
   311  	if result == nil {
   312  		return nil, nil, fmt.Errorf("missing result")
   313  	}
   314  
   315  	// Check if a state update is required. This could be required if we are
   316  	// planning based on stale data, which is causing issues. For example, a
   317  	// node may have failed since we started planning, or there may be
   318  	// conflicting task allocations.
   319  	var state scheduler.State
   320  	if result.RefreshIndex != 0 {
   321  		// Wait for the raft log to catch up to the refresh index
   322  		w.logger.Printf("[DEBUG] worker: refreshing state to index %d", result.RefreshIndex)
   323  		if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
   324  			return nil, nil, err
   325  		}
   326  
   327  		// Snapshot the current state
   328  		snap, err := w.srv.fsm.State().Snapshot()
   329  		if err != nil {
   330  			return nil, nil, fmt.Errorf("failed to snapshot state: %v", err)
   331  		}
   332  		state = snap
   333  	}
   334  
   335  	// Return the result and potential state update
   336  	return result, state, nil
   337  }
   338  
   339  // UpdateEval is used to submit an updated evaluation. This allows
   340  // the worker to act as the planner for the scheduler.
   341  func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
   342  	// Check for a shutdown before submitting the evaluation update
   343  	if w.srv.IsShutdown() {
   344  		return fmt.Errorf("shutdown while planning")
   345  	}
   346  	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())
   347  
   348  	// Store the snapshot index in the eval
   349  	eval.SnapshotIndex = w.snapshotIndex
   350  
   351  	// Setup the request
   352  	req := structs.EvalUpdateRequest{
   353  		Evals:     []*structs.Evaluation{eval},
   354  		EvalToken: w.evalToken,
   355  		WriteRequest: structs.WriteRequest{
   356  			Region: w.srv.config.Region,
   357  		},
   358  	}
   359  	var resp structs.GenericResponse
   360  
   361  SUBMIT:
   362  	// Make the RPC call
   363  	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
   364  		w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v",
   365  			eval, err)
   366  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   367  			goto SUBMIT
   368  		}
   369  		return err
   370  	} else {
   371  		w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval)
   372  		w.backoffReset()
   373  	}
   374  	return nil
   375  }
   376  
   377  // CreateEval is used to create a new evaluation. This allows
   378  // the worker to act as the planner for the scheduler.
   379  func (w *Worker) CreateEval(eval *structs.Evaluation) error {
   380  	// Check for a shutdown before creating the evaluation
   381  	if w.srv.IsShutdown() {
   382  		return fmt.Errorf("shutdown while planning")
   383  	}
   384  	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())
   385  
   386  	// Store the snapshot index in the eval
   387  	eval.SnapshotIndex = w.snapshotIndex
   388  
   389  	// Setup the request
   390  	req := structs.EvalUpdateRequest{
   391  		Evals:     []*structs.Evaluation{eval},
   392  		EvalToken: w.evalToken,
   393  		WriteRequest: structs.WriteRequest{
   394  			Region: w.srv.config.Region,
   395  		},
   396  	}
   397  	var resp structs.GenericResponse
   398  
   399  SUBMIT:
   400  	// Make the RPC call
   401  	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
   402  		w.logger.Printf("[ERR] worker: failed to create evaluation %#v: %v",
   403  			eval, err)
   404  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   405  			goto SUBMIT
   406  		}
   407  		return err
   408  	} else {
   409  		w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval)
   410  		w.backoffReset()
   411  	}
   412  	return nil
   413  }
   414  
   415  // ReblockEval is used to reinsert a blocked evaluation into the blocked eval
   416  // tracker. This allows the worker to act as the planner for the scheduler.
   417  func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
   418  	// Check for a shutdown before reblocking the evaluation
   419  	if w.srv.IsShutdown() {
   420  		return fmt.Errorf("shutdown while planning")
   421  	}
   422  	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())
   423  
   424  	// Store the snapshot index in the eval
   425  	eval.SnapshotIndex = w.snapshotIndex
   426  
   427  	// Setup the request
   428  	req := structs.EvalUpdateRequest{
   429  		Evals:     []*structs.Evaluation{eval},
   430  		EvalToken: w.evalToken,
   431  		WriteRequest: structs.WriteRequest{
   432  			Region: w.srv.config.Region,
   433  		},
   434  	}
   435  	var resp structs.GenericResponse
   436  
   437  SUBMIT:
   438  	// Make the RPC call
   439  	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
   440  		w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v",
   441  			eval, err)
   442  		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
   443  			goto SUBMIT
   444  		}
   445  		return err
   446  	} else {
   447  		w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval)
   448  		w.backoffReset()
   449  	}
   450  	return nil
   451  }
   452  
   453  // shouldResubmit checks if a given error should be swallowed and the plan
   454  // resubmitted after a backoff. Usually these are transient errors that
   455  // the cluster should heal from quickly.
   456  func (w *Worker) shouldResubmit(err error) bool {
   457  	s := err.Error()
   458  	switch {
   459  	case strings.Contains(s, "No cluster leader"):
   460  		return true
   461  	case strings.Contains(s, "plan queue is disabled"):
   462  		return true
   463  	default:
   464  		return false
   465  	}
   466  }
   467  
   468  // backoffErr is used to do an exponential back off on error. This is
   469  // maintained statefully for the worker. Returns true if attempts should be
   470  // abandoned due to shutdown.
   471  func (w *Worker) backoffErr(base, limit time.Duration) bool {
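        	// The delay grows by a factor of four per failure (4^failures * base),
        	// capped at limit; with backoffBaselineSlow this yields 500ms, 2s, 8s, 10s.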
   472  	backoff := (1 << (2 * w.failures)) * base
   473  	if backoff > limit {
   474  		backoff = limit
   475  	} else {
   476  		w.failures++
   477  	}
   478  	select {
   479  	case <-time.After(backoff):
   480  		return false
   481  	case <-w.srv.shutdownCh:
   482  		return true
   483  	}
   484  }
   485  
   486  // backoffReset is used to reset the failure count for
   487  // exponential backoff
   488  func (w *Worker) backoffReset() {
   489  	w.failures = 0
   490  }