github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/worker.go

package nomad

import (
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff
	// but that is much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitFast is the limit of the exponential backoff
	backoffLimitFast = time.Second

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// dequeueTimeout is used to timeout an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to fast Nack and
	// allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period where we don't log about
	// dequeue errors after start. This is to improve the user experience
	// in dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)

// Worker is a single threaded scheduling worker. There may be multiple
// running per server (leader or follower). They are responsible for dequeuing
// pending evaluations, invoking schedulers, plan submission and the
// lifecycle around making task allocations. They bridge the business logic
// of the scheduler with the plumbing required to make it all work.
type Worker struct {
	srv    *Server
	logger *log.Logger
	start  time.Time

	paused    bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond

	failures uint

	evalToken string
}

// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
	w := &Worker{
		srv:    srv,
		logger: srv.logger,
		start:  time.Now(),
	}
	w.pauseCond = sync.NewCond(&w.pauseLock)
	go w.run()
	return w, nil
}

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
	w.pauseLock.Lock()
	w.paused = p
	w.pauseLock.Unlock()
	if !p {
		w.pauseCond.Broadcast()
	}
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
	w.pauseLock.Lock()
	for w.paused {
		w.pauseCond.Wait()
	}
	w.pauseLock.Unlock()
}

// run is the long-lived goroutine which is used to run the worker
func (w *Worker) run() {
	for {
		// Dequeue a pending evaluation
		eval, token, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Check for a shutdown
		if w.srv.IsShutdown() {
			w.sendAck(eval.ID, token, false)
			return
		}

		// Wait for the raft log to catch up to the evaluation
		if err := w.waitForIndex(eval.ModifyIndex, raftSyncLimit); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Invoke the scheduler to determine placements
		if err := w.invokeScheduler(eval, token); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval.ID, token, true)
	}
}
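
// The two helpers below are an illustrative sketch added by the editor and are
// not part of the original file. They show how a caller holding a *Server
// might start a fixed number of workers and pause or unpause them as a group;
// the function names and the numWorkers parameter are hypothetical.
func startWorkers(srv *Server, numWorkers int) []*Worker {
	workers := make([]*Worker, 0, numWorkers)
	for i := 0; i < numWorkers; i++ {
		// NewWorker never returns a non-nil error in this version, but the
		// signature allows for it, so handle it defensively.
		w, err := NewWorker(srv)
		if err != nil {
			srv.logger.Printf("[ERR] worker: failed to start worker: %v", err)
			continue
		}
		workers = append(workers, w)
	}
	return workers
}

// pauseAll pauses or unpauses every worker; unpausing broadcasts on the
// condition variable so parked workers resume dequeuing. Illustrative only.
func pauseAll(workers []*Worker, paused bool) {
	for _, w := range workers {
		w.SetPause(paused)
	}
}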

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (*structs.Evaluation, string, bool) {
	// Setup the request
	req := structs.EvalDequeueRequest{
		Schedulers: w.srv.config.EnabledSchedulers,
		Timeout:    timeout,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Check if we are paused
	w.checkPaused()

	// Make a blocking RPC
	start := time.Now()
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
			w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err)
		}
		if w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			return nil, "", true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response
	if resp.Eval != nil {
		w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID)
		return resp.Eval, resp.Token, false
	}

	// Check for potential shutdown
	if w.srv.IsShutdown() {
		return nil, "", true
	}
	goto REQ
}

// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Setup the request
	req := structs.EvalAckRequest{
		EvalID: evalID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v",
			verb, evalID, err)
	} else {
		w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID)
	}
}

// waitForIndex ensures that the local state is at least as fresh
// as the given index. This is used before starting an evaluation,
// but also potentially mid-stream. If a Plan fails because of stale
// state (attempt to allocate to a failed/dead node), we may need
// to sync our state again and do the planning with more recent data.
func (w *Worker) waitForIndex(index uint64, timeout time.Duration) error {
	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
CHECK:
	// We only need the FSM state to be as recent as the given index
	appliedIndex := w.srv.raft.AppliedIndex()
	if index <= appliedIndex {
		w.backoffReset()
		return nil
	}

	// Check if we've reached our limit
	if time.Since(start) > timeout {
		return fmt.Errorf("sync wait timeout reached")
	}

	// Exponential back off if we haven't yet reached it
	if w.backoffErr(backoffBaselineFast, backoffLimitFast) {
		return fmt.Errorf("shutdown while waiting for state sync")
	}
	goto CHECK
}

// invokeScheduler is used to invoke the business logic of the scheduler
func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token
	w.evalToken = token

	// Snapshot the current state
	snap, err := w.srv.fsm.State().Snapshot()
	if err != nil {
		return fmt.Errorf("failed to snapshot state: %v", err)
	}

	// Create the scheduler, or use the special core scheduler
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}
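
// Editor's note, not part of the original file: invokeScheduler hands the
// worker itself to scheduler.NewScheduler as the scheduler's planner, so the
// worker is assumed to satisfy the scheduler package's planner interface via
// SubmitPlan, UpdateEval and CreateEval below. Assuming that interface is
// named scheduler.Planner, a compile-time assertion of this would look like:
//
//	var _ scheduler.Planner = (*Worker)(nil)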

// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
	plan.EvalToken = w.evalToken

	// Setup the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
			plan.EvalID, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This could be the case if we were
	// planning based on stale data, for example after a node failure since
	// the time we started planning, or because of conflicting task
	// allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catch up to the evaluation
		w.logger.Printf("[DEBUG] worker: refreshing state to index %d", result.RefreshIndex)
		if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
			return nil, nil, err
		}

		// Snapshot the current state
		snap, err := w.srv.fsm.State().Snapshot()
		if err != nil {
			return nil, nil, fmt.Errorf("failed to snapshot state: %v", err)
		}
		state = snap
	}

	// Return the result and potential state update
	return result, state, nil
}

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before updating the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}
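
// Illustrative sketch, not part of the original file: a scheduler using this
// worker as its planner would typically submit a plan and, when a refreshed
// state snapshot comes back, re-plan against it rather than trusting its stale
// view. The names plan, newState, and replan below are hypothetical.
//
//	result, newState, err := w.SubmitPlan(plan)
//	if err != nil {
//		return err
//	}
//	if newState != nil {
//		// The plan was evaluated against stale data (RefreshIndex was set);
//		// redo the placement decisions against the fresher snapshot.
//		return replan(newState, result)
//	}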

// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) CreateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before creating the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to create evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
func (w *Worker) shouldResubmit(err error) bool {
	s := err.Error()
	switch {
	case strings.Contains(s, "No cluster leader"):
		return true
	case strings.Contains(s, "plan queue is disabled"):
		return true
	default:
		return false
	}
}

// backoffErr is used to do an exponential back off on error. This is
// maintained statefully for the worker. Returns true if further attempts
// should be abandoned due to shutdown.
func (w *Worker) backoffErr(base, limit time.Duration) bool {
	backoff := (1 << (2 * w.failures)) * base
	if backoff > limit {
		backoff = limit
	} else {
		w.failures++
	}
	select {
	case <-time.After(backoff):
		return false
	case <-w.srv.shutdownCh:
		return true
	}
}

// backoffReset is used to reset the failure count for
// exponential backoff
func (w *Worker) backoffReset() {
	w.failures = 0
}
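
// Worked example (added for illustration; not part of the original file):
// backoffErr quadruples the delay on each consecutive failure, since the shift
// amount is 2*failures. With the slow parameters (base 500ms, limit 10s) the
// waits are 500ms, 2s, 8s, then capped at 10s; with the fast parameters
// (base 20ms, limit 1s) they are 20ms, 80ms, 320ms, then capped at 1s. Once
// the cap is reached the failure counter stops growing, so backoffReset only
// needs to clear it back to zero after a success.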