github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/nomad/worker.go

package nomad

import (
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff
	// that is much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitFast is the limit of the exponential backoff
	backoffLimitFast = time.Second

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// backoffSchedulerVersionMismatch is the backoff between retries when the
	// scheduler version does not match that of the leader.
	backoffSchedulerVersionMismatch = 30 * time.Second

	// dequeueTimeout is used to time out an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to fast Nack and
	// allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period after start during which we do
	// not log dequeue errors. This is to improve the user experience in
	// dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)

// Worker is a single-threaded scheduling worker. There may be multiple
// workers running per server (leader or follower). They are responsible for
// dequeuing pending evaluations, invoking schedulers, submitting plans, and
// managing the lifecycle around making task allocations. They bridge the
// business logic of the scheduler with the plumbing required to make it all
// work.
type Worker struct {
	srv    *Server
	logger *log.Logger
	start  time.Time

	paused    bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond

	failures uint

	evalToken string

	// snapshotIndex is the index of the snapshot in which the scheduler was
	// first invoked. It is used to mark the SnapshotIndex of evaluations
	// Created, Updated or Reblocked.
	snapshotIndex uint64
}

// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
	w := &Worker{
		srv:    srv,
		logger: srv.logger,
		start:  time.Now(),
	}
	w.pauseCond = sync.NewCond(&w.pauseLock)
	go w.run()
	return w, nil
}
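
// The server typically runs several of these workers side by side, one per
// configured scheduler goroutine. The sketch below is illustrative only and
// is not part of the upstream file: startWorkers and its count parameter are
// hypothetical; the real wiring lives in the server's setup code, not shown
// here.
func startWorkers(srv *Server, count int) ([]*Worker, error) {
	workers := make([]*Worker, 0, count)
	for i := 0; i < count; i++ {
		// NewWorker immediately starts the worker's run loop in a goroutine.
		w, err := NewWorker(srv)
		if err != nil {
			return nil, fmt.Errorf("failed to start worker %d: %v", i, err)
		}
		workers = append(workers, w)
	}
	return workers, nil
}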

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
	w.pauseLock.Lock()
	w.paused = p
	w.pauseLock.Unlock()
	if !p {
		w.pauseCond.Broadcast()
	}
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
	w.pauseLock.Lock()
	for w.paused {
		w.pauseCond.Wait()
	}
	w.pauseLock.Unlock()
}
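
// Pausing is how the rest of the server parks schedulers without tearing the
// workers down: SetPause(true) makes each worker block in checkPaused at the
// top of its next dequeue attempt. A minimal, hypothetical sketch of toggling
// a set of workers around some critical section (pauseAll is not part of the
// upstream code):
func pauseAll(workers []*Worker, pause bool) {
	for _, w := range workers {
		w.SetPause(pause)
	}
}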

// run is the long-lived goroutine which is used to run the worker
func (w *Worker) run() {
	for {
		// Dequeue a pending evaluation
		eval, token, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Check for a shutdown
		if w.srv.IsShutdown() {
			w.sendAck(eval.ID, token, false)
			return
		}

		// Wait for the raft log to catch up to the evaluation
		if err := w.waitForIndex(eval.ModifyIndex, raftSyncLimit); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Invoke the scheduler to determine placements
		if err := w.invokeScheduler(eval, token); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval.ID, token, true)
	}
}

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (*structs.Evaluation, string, bool) {
	// Setup the request
	req := structs.EvalDequeueRequest{
		Schedulers:       w.srv.config.EnabledSchedulers,
		Timeout:          timeout,
		SchedulerVersion: scheduler.SchedulerVersion,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Check if we are paused
	w.checkPaused()

	// Make a blocking RPC
	start := time.Now()
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
			w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err)
		}

		// Adjust the backoff based on the error. If it is a scheduler version
		// mismatch we increase the baseline.
		base, limit := backoffBaselineFast, backoffLimitSlow
		if strings.Contains(err.Error(), "calling scheduler version") {
			base = backoffSchedulerVersionMismatch
			limit = backoffSchedulerVersionMismatch
		}

		if w.backoffErr(base, limit) {
			return nil, "", true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response
	if resp.Eval != nil {
		w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID)
		return resp.Eval, resp.Token, false
	}

	// Check for potential shutdown
	if w.srv.IsShutdown() {
		return nil, "", true
	}
	goto REQ
}

// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Setup the request
	req := structs.EvalAckRequest{
		EvalID: evalID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v",
			verb, evalID, err)
	} else {
		w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID)
	}
}

// waitForIndex ensures that the local state is at least as fresh
// as the given index. This is used before starting an evaluation,
// but also potentially mid-stream. If a Plan fails because of stale
// state (attempt to allocate to a failed/dead node), we may need
// to sync our state again and do the planning with more recent data.
func (w *Worker) waitForIndex(index uint64, timeout time.Duration) error {
	// XXX: Potential optimization is to set up a watch on the state store's
	// index table and only unblock via a trigger rather than timing out and
	// checking.

	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
CHECK:
	// Get the state store's current index
	snapshotIndex, err := w.srv.fsm.State().LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine state store's index: %v", err)
	}

	// We only need the FSM state to be as recent as the given index
	if index <= snapshotIndex {
		w.backoffReset()
		return nil
	}

	// Check if we've reached our limit
	if time.Since(start) > timeout {
		return fmt.Errorf("sync wait timeout reached")
	}

	// Exponentially back off if we haven't yet reached it
	if w.backoffErr(backoffBaselineFast, backoffLimitFast) {
		return fmt.Errorf("shutdown while waiting for state sync")
	}
	goto CHECK
}
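
// waitForIndex is what lets the worker trust reads from its local state
// store. A hypothetical illustration (not upstream code) of the pattern run()
// and SubmitPlan follow: wait for the FSM to reach a known Raft index before
// reading, here reusing JobSummaryByID as seen in ReblockEval below.
func summaryAtIndex(w *Worker, jobID string, index uint64) (*structs.JobSummary, error) {
	// Block until the local state store has replicated up to the index.
	if err := w.waitForIndex(index, raftSyncLimit); err != nil {
		return nil, err
	}
	// The read is now guaranteed to reflect at least that index.
	return w.srv.fsm.state.JobSummaryByID(jobID)
}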

// invokeScheduler is used to invoke the business logic of the scheduler
func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token
	w.evalToken = token

	// Snapshot the current state
	snap, err := w.srv.fsm.State().Snapshot()
	if err != nil {
		return fmt.Errorf("failed to snapshot state: %v", err)
	}

	// Store the snapshot's index
	w.snapshotIndex, err = snap.LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine snapshot's index: %v", err)
	}

	// Create the scheduler, or use the special core scheduler
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}
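
// invokeScheduler passes the Worker itself as the final argument to
// scheduler.NewScheduler: the Worker acts as the planner for the scheduler,
// which is why SubmitPlan, UpdateEval, CreateEval and ReblockEval below
// exist. A hedged, illustrative compile-time assertion (not in the upstream
// file), assuming the scheduler package of this version names that interface
// Planner:
var _ scheduler.Planner = (*Worker)(nil)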

// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
	plan.EvalToken = w.evalToken

	// Setup the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
			plan.EvalID, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This could be needed if we were
	// planning based on stale data, for example because of a node failure
	// since the time we started planning, or conflicting task allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catch up to the evaluation
		w.logger.Printf("[DEBUG] worker: refreshing state to index %d for %q", result.RefreshIndex, plan.EvalID)
		if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
			return nil, nil, err
		}

		// Snapshot the current state
		snap, err := w.srv.fsm.State().Snapshot()
		if err != nil {
			return nil, nil, fmt.Errorf("failed to snapshot state: %v", err)
		}
		state = snap
	}

	// Return the result and potential state update
	return result, state, nil
}
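
// When the plan could not be applied against up-to-date state, the response
// carries a non-zero RefreshIndex and SubmitPlan hands back a refreshed
// snapshot as its State return value so the scheduler can re-plan against
// newer data. A minimal, hypothetical sketch of that caller-side contract
// (submitAndCheckRefresh is illustrative and not part of the upstream code):
func submitAndCheckRefresh(w *Worker, plan *structs.Plan) (*structs.PlanResult, bool, error) {
	result, state, err := w.SubmitPlan(plan)
	if err != nil {
		return nil, false, err
	}
	// A non-nil state means the worker already waited for the Raft log to
	// reach result.RefreshIndex; the caller should rebuild its view from it.
	return result, state != nil, nil
}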

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before submitting the update
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) CreateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before creating the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to create evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}
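
// Schedulers use CreateEval to persist follow-up evaluations (for example a
// retry after failed placements). A minimal, hypothetical sketch; the field
// values and the structs.GenerateUUID/structs.EvalStatusPending helpers are
// assumptions about the structs package of this era, not code taken from it:
func createFollowUpEval(w *Worker, parent *structs.Evaluation, triggeredBy string) error {
	followUp := &structs.Evaluation{
		ID:          structs.GenerateUUID(),
		Priority:    parent.Priority,
		Type:        parent.Type,
		TriggeredBy: triggeredBy,
		JobID:       parent.JobID,
		Status:      structs.EvalStatusPending,
	}
	// CreateEval stamps the eval with the worker's snapshot index and
	// retries transient RPC failures with the slow backoff.
	return w.CreateEval(followUp)
}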

// ReblockEval is used to reinsert a blocked evaluation into the blocked eval
// tracker. This allows the worker to act as the planner for the scheduler.
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
	// Check for a shutdown before reblocking the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())

	// Update the evaluation if the queued allocations are not the same as
	// what is recorded in the job summary
	summary, err := w.srv.fsm.state.JobSummaryByID(eval.JobID)
	if err != nil {
		return fmt.Errorf("couldn't retrieve job summary: %v", err)
	}
	if summary != nil {
		var hasChanged bool
		for tg, tgSummary := range summary.Summary {
			if queued, ok := eval.QueuedAllocations[tg]; ok {
				if queued != tgSummary.Queued {
					hasChanged = true
					break
				}
			}
		}
		if hasChanged {
			if err := w.UpdateEval(eval); err != nil {
				return err
			}
		}
	}

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the request
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
func (w *Worker) shouldResubmit(err error) bool {
	s := err.Error()
	switch {
	case strings.Contains(s, "No cluster leader"):
		return true
	case strings.Contains(s, "plan queue is disabled"):
		return true
	default:
		return false
	}
}

// backoffErr is used to do an exponential back off on error. This is
// maintained statefully for the worker. It returns true if attempts should
// be abandoned due to shutdown.
func (w *Worker) backoffErr(base, limit time.Duration) bool {
	backoff := (1 << (2 * w.failures)) * base
	if backoff > limit {
		backoff = limit
	} else {
		w.failures++
	}
	select {
	case <-time.After(backoff):
		return false
	case <-w.srv.shutdownCh:
		return true
	}
}

// backoffReset is used to reset the failure count for
// exponential backoff
func (w *Worker) backoffReset() {
	w.failures = 0
}
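
// backoffErr quadruples the wait on each consecutive failure: with
// backoffBaselineFast (20ms) the waits run 20ms, 80ms, 320ms, and so on,
// clamped at the limit (1s for backoffLimitFast, 10s for backoffLimitSlow).
// A small, hypothetical helper (not part of the upstream code) that
// reproduces that arithmetic for illustration:
func backoffSchedule(base, limit time.Duration, attempts uint) []time.Duration {
	schedule := make([]time.Duration, 0, attempts)
	var failures uint
	for i := uint(0); i < attempts; i++ {
		// Same formula as backoffErr above.
		backoff := (1 << (2 * failures)) * base
		if backoff > limit {
			backoff = limit
		} else {
			failures++
		}
		schedule = append(schedule, backoff)
	}
	return schedule
}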