github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/eval_broker.go

     1  package nomad
     2  
     3  import (
     4  	"container/heap"
     5  	"errors"
     6  	"fmt"
     7  	"math/rand"
     8  	"sync"
     9  	"time"
    10  
    11  	"context"
    12  
    13  	"github.com/armon/go-metrics"
    14  	"github.com/hashicorp/nomad/helper/uuid"
    15  	"github.com/hashicorp/nomad/lib/delayheap"
    16  	"github.com/hashicorp/nomad/nomad/structs"
    17  )
    18  
    19  const (
    20  	// failedQueue is the queue we add Evaluations to once
    21  	// they've reached the deliveryLimit. This allows the leader to
    22  	// set the status to failed.
    23  	failedQueue = "_failed"
    24  )
    25  
    26  var (
    27  	// ErrNotOutstanding is returned if an evaluation is not outstanding
    28  	ErrNotOutstanding = errors.New("evaluation is not outstanding")
    29  
    30  	// ErrTokenMismatch is returned if the outstanding eval has a different token
    31  	ErrTokenMismatch = errors.New("evaluation token does not match")
    32  
    33  	// ErrNackTimeoutReached is returned if an expired evaluation is reset
    34  	ErrNackTimeoutReached = errors.New("evaluation nack timeout reached")
    35  )
    36  
    37  // EvalBroker is used to manage brokering of evaluations. When an evaluation is
    38  // created, due to a change in a job specification or a node, we put it into the
    39  // broker. The broker sorts evaluations by priority and scheduler type. This
    40  // allows us to dequeue the highest priority work first, while also allowing sub-schedulers
    41  // to only dequeue work they know how to handle. The broker is designed to be entirely
    42  // in-memory and is managed by the leader node.
    43  //
    44  // The broker must provide at-least-once delivery semantics. It relies on explicit
    45  // Ack/Nack messages to handle this. If a delivery is not Ack'd in a sufficient time
    46  // span, it will be assumed Nack'd.
    47  type EvalBroker struct {
    48  	nackTimeout   time.Duration
    49  	deliveryLimit int
    50  
    51  	enabled bool
    52  	stats   *BrokerStats
    53  
    54  	// evals tracks queued evaluations by ID to de-duplicate enqueue.
    55  	// The counter is the number of times we've attempted delivery,
    56  	// and is used to eventually fail an evaluation.
    57  	evals map[string]int
    58  
    59  	// jobEvals tracks queued evaluations by a job's ID and namespace to serialize them
    60  	jobEvals map[structs.NamespacedID]string
    61  
    62  	// blocked tracks the blocked evaluations by JobID in a priority queue
    63  	blocked map[structs.NamespacedID]PendingEvaluations
    64  
    65  	// ready tracks the ready jobs by scheduler in a priority queue
    66  	ready map[string]PendingEvaluations
    67  
    68  	// unack is a map of evalID to an un-acknowledged evaluation
    69  	unack map[string]*unackEval
    70  
    71  	// waiting is used to notify on a per-scheduler basis of ready work
    72  	waiting map[string]chan struct{}
    73  
    74  	// requeue tracks, by token, evaluations that need to be re-enqueued once
    75  	// the current evaluation finishes. If the token is Nacked or rejected the
    76  	// evaluation is dropped, but if it is Acked successfully, the evaluation
    77  	// is queued.
    78  	requeue map[string]*structs.Evaluation
    79  
    80  	// timeWait has evaluations that are waiting for time to elapse
    81  	timeWait map[string]*time.Timer
    82  
    83  	// delayedEvalCancelFunc is used to stop the long running go routine
    84  	// that processes delayed evaluations
    85  	delayedEvalCancelFunc context.CancelFunc
    86  
    87  	// delayHeap is a heap used to track incoming evaluations that are
    88  	// not eligible to be enqueued until their WaitUntil time has passed
    89  	delayHeap *delayheap.DelayHeap
    90  
    91  	// delayedEvalsUpdateCh is used to trigger notifications for updates
    92  	// to the delayHeap
    93  	delayedEvalsUpdateCh chan struct{}
    94  
    95  	// initialNackDelay is the delay applied before re-enqueuing a
    96  	// Nacked evaluation for the first time.
    97  	initialNackDelay time.Duration
    98  
    99  	// subsequentNackDelay is the delay applied before reenqueuing
   100  	// an evaluation that has been Nacked more than once. This delay is
   101  	// compounding after the first Nack.
   102  	subsequentNackDelay time.Duration
   103  
   104  	l sync.RWMutex
   105  }
   106  
   107  // unackEval tracks an unacknowledged evaluation along with the Nack timer
   108  type unackEval struct {
   109  	Eval      *structs.Evaluation
   110  	Token     string
   111  	NackTimer *time.Timer
   112  }
   113  
   114  // PendingEvaluations is a list of waiting evaluations.
   115  // We implement the container/heap interface so that this is a
   116  // priority queue
   117  type PendingEvaluations []*structs.Evaluation
   118  
   119  // NewEvalBroker creates a new evaluation broker. It is parameterized with
   120  // the timeout after which an unacknowledged delivery is assumed to be
   121  // Nack'd and is redelivered, as well as the deliveryLimit, which prevents a
   122  // failing eval from being endlessly redelivered. initialNackDelay is the
   123  // delay before a Nacked evaluation is made available again after its first
   124  // Nack, and subsequentNackDelay is the compounding delay applied after
   125  // subsequent Nacks.
   126  func NewEvalBroker(timeout, initialNackDelay, subsequentNackDelay time.Duration, deliveryLimit int) (*EvalBroker, error) {
   127  	if timeout < 0 {
   128  		return nil, fmt.Errorf("timeout cannot be negative")
   129  	}
   130  	b := &EvalBroker{
   131  		nackTimeout:          timeout,
   132  		deliveryLimit:        deliveryLimit,
   133  		enabled:              false,
   134  		stats:                new(BrokerStats),
   135  		evals:                make(map[string]int),
   136  		jobEvals:             make(map[structs.NamespacedID]string),
   137  		blocked:              make(map[structs.NamespacedID]PendingEvaluations),
   138  		ready:                make(map[string]PendingEvaluations),
   139  		unack:                make(map[string]*unackEval),
   140  		waiting:              make(map[string]chan struct{}),
   141  		requeue:              make(map[string]*structs.Evaluation),
   142  		timeWait:             make(map[string]*time.Timer),
   143  		initialNackDelay:     initialNackDelay,
   144  		subsequentNackDelay:  subsequentNackDelay,
   145  		delayHeap:            delayheap.NewDelayHeap(),
   146  		delayedEvalsUpdateCh: make(chan struct{}, 1),
   147  	}
   148  	b.stats.ByScheduler = make(map[string]*SchedulerStats)
   149  
   150  	return b, nil
   151  }
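
// NOTE: the following example is an editorial sketch, not part of the original
// file. It illustrates constructing and enabling a broker and enqueuing a
// single evaluation; the durations, delivery limit, and evaluation fields are
// assumed values chosen for illustration.
func exampleNewBroker() (*EvalBroker, error) {
	// 5s Nack timeout, 1s initial Nack delay, 5s subsequent Nack delay, 3 deliveries max.
	b, err := NewEvalBroker(5*time.Second, time.Second, 5*time.Second, 3)
	if err != nil {
		return nil, err
	}

	// The broker only accepts and hands out work while enabled (active leader only).
	b.SetEnabled(true)

	// Enqueue a service evaluation; its Type selects the ready queue it lands in.
	b.Enqueue(&structs.Evaluation{
		ID:        uuid.Generate(),
		Namespace: "default",
		Priority:  50,
		Type:      structs.JobTypeService,
		JobID:     "example-job",
	})
	return b, nil
}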
   152  
   153  // Enabled is used to check if the broker is enabled.
   154  func (b *EvalBroker) Enabled() bool {
   155  	b.l.RLock()
   156  	defer b.l.RUnlock()
   157  	return b.enabled
   158  }
   159  
   160  // SetEnabled is used to control if the broker is enabled. The broker
   161  // should only be enabled on the active leader.
   162  func (b *EvalBroker) SetEnabled(enabled bool) {
   163  	b.l.Lock()
   164  	prevEnabled := b.enabled
   165  	b.enabled = enabled
   166  	if !prevEnabled && enabled {
   167  		// start the go routine for delayed evals
   168  		ctx, cancel := context.WithCancel(context.Background())
   169  		b.delayedEvalCancelFunc = cancel
   170  		go b.runDelayedEvalsWatcher(ctx)
   171  	}
   172  	b.l.Unlock()
   173  	if !enabled {
   174  		b.flush()
   175  	}
   176  }
   177  
   178  // Enqueue is used to enqueue a new evaluation
   179  func (b *EvalBroker) Enqueue(eval *structs.Evaluation) {
   180  	b.l.Lock()
   181  	defer b.l.Unlock()
   182  	b.processEnqueue(eval, "")
   183  }
   184  
   185  // EnqueueAll is used to enqueue many evaluations. The map allows evaluations
   186  // that are being re-enqueued to include their token.
   187  //
   188  // When requeuing an evaluation that may already be enqueued, the evaluation
   189  // is handled in one of the following ways (see the sketch after this function):
   190  // * Evaluation not outstanding: process it as a normal Enqueue
   191  // * Evaluation outstanding: do not allow the evaluation to be dequeued until:
   192  //    * Ack received: unblock the evaluation, allowing it to be dequeued
   193  //    * Nack received: drop the evaluation, as it was created as a result of a
   194  //      scheduler run that was Nack'd
   195  func (b *EvalBroker) EnqueueAll(evals map[*structs.Evaluation]string) {
   196  	// The lock needs to be held until all evaluations are enqueued. This is so
   197  	// that when Dequeue operations are unblocked they will pick the highest
   198  	// priority evaluations.
   199  	b.l.Lock()
   200  	defer b.l.Unlock()
   201  	for eval, token := range evals {
   202  		b.processEnqueue(eval, token)
   203  	}
   204  }
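
// NOTE: editorial sketch, not part of the original file. It shows the two ways
// an evaluation can be handed to EnqueueAll: with an empty token it is treated
// as a normal Enqueue, while an evaluation carrying the token from its Dequeue
// is held until the outstanding evaluation is Acked (then enqueued) or Nacked
// (then dropped). The parameter names are illustrative.
func exampleEnqueueAll(b *EvalBroker, newEval, reblocked *structs.Evaluation, token string) {
	b.EnqueueAll(map[*structs.Evaluation]string{
		newEval:   "",    // not outstanding: processed as a normal Enqueue
		reblocked: token, // outstanding: blocked until the current eval is Acked or Nacked
	})
}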
   205  
   206  // processEnqueue deduplicates evals and either enqueues them immediately or
   207  // enforces the eval's wait time. If the token is passed, and the evaluation ID is
   208  // outstanding, the evaluation is blocked until an Ack/Nack is received.
   209  // processEnqueue must be called with the lock held.
   210  func (b *EvalBroker) processEnqueue(eval *structs.Evaluation, token string) {
   211  	// Check if already enqueued
   212  	if _, ok := b.evals[eval.ID]; ok {
   213  		if token == "" {
   214  			return
   215  		}
   216  
   217  		// If the token has been passed, the evaluation is being reblocked by
   218  		// the scheduler and should be processed once the outstanding evaluation
   219  		// is Acked or Nacked.
   220  		if unack, ok := b.unack[eval.ID]; ok && unack.Token == token {
   221  			b.requeue[token] = eval
   222  		}
   223  		return
   224  	} else if b.enabled {
   225  		b.evals[eval.ID] = 0
   226  	}
   227  
   228  	// Check if we need to enforce a wait
   229  	if eval.Wait > 0 {
   230  		b.processWaitingEnqueue(eval)
   231  		return
   232  	}
   233  
   234  	if !eval.WaitUntil.IsZero() {
   235  		b.delayHeap.Push(&evalWrapper{eval}, eval.WaitUntil)
   236  		b.stats.TotalWaiting += 1
   237  		// Signal an update.
   238  		select {
   239  		case b.delayedEvalsUpdateCh <- struct{}{}:
   240  		default:
   241  		}
   242  		return
   243  	}
   244  
   245  	b.enqueueLocked(eval, eval.Type)
   246  }
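
// NOTE: editorial sketch, not part of the original file. It shows the two
// delay mechanisms handled by processEnqueue: a relative Wait duration backed
// by a timer, and an absolute WaitUntil time backed by the delay heap. The
// field values are illustrative.
func exampleDelayedEnqueue(b *EvalBroker) {
	// Relative delay: eligible to be enqueued 10 seconds after this call.
	b.Enqueue(&structs.Evaluation{
		ID:   uuid.Generate(),
		Type: structs.JobTypeService,
		Wait: 10 * time.Second,
	})

	// Absolute deadline: parked on the delay heap until the wall-clock time passes.
	b.Enqueue(&structs.Evaluation{
		ID:        uuid.Generate(),
		Type:      structs.JobTypeBatch,
		WaitUntil: time.Now().Add(30 * time.Second),
	})
}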
   247  
   248  // processWaitingEnqueue waits the evaluation's Wait duration before
   249  // enqueuing it.
   250  func (b *EvalBroker) processWaitingEnqueue(eval *structs.Evaluation) {
   251  	timer := time.AfterFunc(eval.Wait, func() {
   252  		b.enqueueWaiting(eval)
   253  	})
   254  	b.timeWait[eval.ID] = timer
   255  	b.stats.TotalWaiting += 1
   256  }
   257  
   258  // enqueueWaiting is used to enqueue a waiting evaluation
   259  func (b *EvalBroker) enqueueWaiting(eval *structs.Evaluation) {
   260  	b.l.Lock()
   261  	defer b.l.Unlock()
   262  	delete(b.timeWait, eval.ID)
   263  	b.stats.TotalWaiting -= 1
   264  	b.enqueueLocked(eval, eval.Type)
   265  }
   266  
   267  // enqueueLocked is used to enqueue with the lock held
   268  func (b *EvalBroker) enqueueLocked(eval *structs.Evaluation, queue string) {
   269  	// Do nothing if not enabled
   270  	if !b.enabled {
   271  		return
   272  	}
   273  
   274  	// Check if there is an evaluation for this JobID pending
   275  	namespacedID := structs.NamespacedID{
   276  		ID:        eval.JobID,
   277  		Namespace: eval.Namespace,
   278  	}
   279  	pendingEval := b.jobEvals[namespacedID]
   280  	if pendingEval == "" {
   281  		b.jobEvals[namespacedID] = eval.ID
   282  	} else if pendingEval != eval.ID {
   283  		blocked := b.blocked[namespacedID]
   284  		heap.Push(&blocked, eval)
   285  		b.blocked[namespacedID] = blocked
   286  		b.stats.TotalBlocked += 1
   287  		return
   288  	}
   289  
   290  	// Find the pending by scheduler class
   291  	pending, ok := b.ready[queue]
   292  	if !ok {
   293  		pending = make([]*structs.Evaluation, 0, 16)
   294  		if _, ok := b.waiting[queue]; !ok {
   295  			b.waiting[queue] = make(chan struct{}, 1)
   296  		}
   297  	}
   298  
   299  	// Push onto the heap
   300  	heap.Push(&pending, eval)
   301  	b.ready[queue] = pending
   302  
   303  	// Update the stats
   304  	b.stats.TotalReady += 1
   305  	bySched, ok := b.stats.ByScheduler[queue]
   306  	if !ok {
   307  		bySched = &SchedulerStats{}
   308  		b.stats.ByScheduler[queue] = bySched
   309  	}
   310  	bySched.Ready += 1
   311  
   312  	// Unblock any blocked dequeues
   313  	select {
   314  	case b.waiting[queue] <- struct{}{}:
   315  	default:
   316  	}
   317  }
   318  
   319  // Dequeue is used to perform a blocking dequeue
   320  func (b *EvalBroker) Dequeue(schedulers []string, timeout time.Duration) (*structs.Evaluation, string, error) {
   321  	var timeoutTimer *time.Timer
   322  	var timeoutCh <-chan time.Time
   323  SCAN:
   324  	// Scan for work
   325  	eval, token, err := b.scanForSchedulers(schedulers)
   326  	if err != nil {
   327  		if timeoutTimer != nil {
   328  			timeoutTimer.Stop()
   329  		}
   330  		return nil, "", err
   331  	}
   332  
   333  	// Check if we have something
   334  	if eval != nil {
   335  		if timeoutTimer != nil {
   336  			timeoutTimer.Stop()
   337  		}
   338  		return eval, token, nil
   339  	}
   340  
   341  	// Setup the timeout channel the first time around
   342  	if timeoutTimer == nil && timeout != 0 {
   343  		timeoutTimer = time.NewTimer(timeout)
   344  		timeoutCh = timeoutTimer.C
   345  	}
   346  
   347  	// Block until we get work
   348  	scan := b.waitForSchedulers(schedulers, timeoutCh)
   349  	if scan {
   350  		goto SCAN
   351  	}
   352  	return nil, "", nil
   353  }
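
// NOTE: editorial sketch, not part of the original file. It shows a scheduler
// worker loop built on Dequeue: block up to a timeout, process the evaluation,
// and either Ack with the returned token or Nack so the broker redelivers.
// processEval is a hypothetical placeholder for the scheduler's own work.
func exampleWorkerLoop(b *EvalBroker, processEval func(*structs.Evaluation) error) {
	for {
		eval, token, err := b.Dequeue([]string{structs.JobTypeService, structs.JobTypeBatch}, 5*time.Second)
		if err != nil {
			return // broker disabled, e.g. this node lost leadership
		}
		if eval == nil {
			continue // timed out with no ready work; poll again
		}
		if perr := processEval(eval); perr != nil {
			b.Nack(eval.ID, token) // redeliver, subject to the delivery limit
			continue
		}
		b.Ack(eval.ID, token)
	}
}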
   354  
   355  // scanForSchedulers scans for work on any of the schedulers. The highest priority work
   356  // is dequeued first. This may return nothing if there is no work waiting.
   357  func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation, string, error) {
   358  	b.l.Lock()
   359  	defer b.l.Unlock()
   360  
   361  	// Do nothing if not enabled
   362  	if !b.enabled {
   363  		return nil, "", fmt.Errorf("eval broker disabled")
   364  	}
   365  
   366  	// Scan for eligible work
   367  	var eligibleSched []string
   368  	var eligiblePriority int
   369  	for _, sched := range schedulers {
   370  		// Get the pending queue
   371  		pending, ok := b.ready[sched]
   372  		if !ok {
   373  			continue
   374  		}
   375  
   376  		// Peek at the next item
   377  		ready := pending.Peek()
   378  		if ready == nil {
   379  			continue
   380  		}
   381  
   382  		// Add to eligible if equal or greater priority
   383  		if len(eligibleSched) == 0 || ready.Priority > eligiblePriority {
   384  			eligibleSched = []string{sched}
   385  			eligiblePriority = ready.Priority
   386  
   387  		} else if eligiblePriority > ready.Priority {
   388  			continue
   389  
   390  		} else if eligiblePriority == ready.Priority {
   391  			eligibleSched = append(eligibleSched, sched)
   392  		}
   393  	}
   394  
   395  	// Determine behavior based on eligible work
   396  	switch n := len(eligibleSched); n {
   397  	case 0:
   398  		// No work to do!
   399  		return nil, "", nil
   400  
   401  	case 1:
   402  		// Only a single eligible scheduler, dequeue from it
   403  		return b.dequeueForSched(eligibleSched[0])
   404  
   405  	default:
   406  		// Multiple eligible schedulers. We pick one at random so that we
   407  		// fairly distribute work.
   408  		offset := rand.Intn(n)
   409  		return b.dequeueForSched(eligibleSched[offset])
   410  	}
   411  }
   412  
   413  // dequeueForSched is used to dequeue the next work item for a given scheduler.
   414  // This assumes locks are held and that this scheduler has work
   415  func (b *EvalBroker) dequeueForSched(sched string) (*structs.Evaluation, string, error) {
   416  	// Get the pending queue
   417  	pending := b.ready[sched]
   418  	raw := heap.Pop(&pending)
   419  	b.ready[sched] = pending
   420  	eval := raw.(*structs.Evaluation)
   421  
   422  	// Generate a UUID for the token
   423  	token := uuid.Generate()
   424  
   425  	// Setup Nack timer
   426  	nackTimer := time.AfterFunc(b.nackTimeout, func() {
   427  		b.Nack(eval.ID, token)
   428  	})
   429  
   430  	// Add to the unack queue
   431  	b.unack[eval.ID] = &unackEval{
   432  		Eval:      eval,
   433  		Token:     token,
   434  		NackTimer: nackTimer,
   435  	}
   436  
   437  	// Increment the dequeue count
   438  	b.evals[eval.ID] += 1
   439  
   440  	// Update the stats
   441  	b.stats.TotalReady -= 1
   442  	b.stats.TotalUnacked += 1
   443  	bySched := b.stats.ByScheduler[sched]
   444  	bySched.Ready -= 1
   445  	bySched.Unacked += 1
   446  
   447  	return eval, token, nil
   448  }
   449  
   450  // waitForSchedulers is used to wait for work on any of the schedulers or until a timeout.
   451  // It returns true if there may be work waiting and the caller should scan again.
   452  func (b *EvalBroker) waitForSchedulers(schedulers []string, timeoutCh <-chan time.Time) bool {
   453  	doneCh := make(chan struct{})
   454  	readyCh := make(chan struct{}, 1)
   455  	defer close(doneCh)
   456  
   457  	// Start all the watchers
   458  	b.l.Lock()
   459  	for _, sched := range schedulers {
   460  		waitCh, ok := b.waiting[sched]
   461  		if !ok {
   462  			waitCh = make(chan struct{}, 1)
   463  			b.waiting[sched] = waitCh
   464  		}
   465  
   466  		// Start a goroutine that either waits for the waitCh on this scheduler
   467  		// to unblock or for this waitForSchedulers call to return
   468  		go func() {
   469  			select {
   470  			case <-waitCh:
   471  				select {
   472  				case readyCh <- struct{}{}:
   473  				default:
   474  				}
   475  			case <-doneCh:
   476  			}
   477  		}()
   478  	}
   479  	b.l.Unlock()
   480  
   481  	// Block until we have ready work and should scan, or until we timeout
   482  	// and should not make an attempt to scan for work
   483  	select {
   484  	case <-readyCh:
   485  		return true
   486  	case <-timeoutCh:
   487  		return false
   488  	}
   489  }
   490  
   491  // Outstanding checks if an EvalID has been delivered but not acknowledged
   492  // and returns the associated token for the evaluation.
   493  func (b *EvalBroker) Outstanding(evalID string) (string, bool) {
   494  	b.l.RLock()
   495  	defer b.l.RUnlock()
   496  	unack, ok := b.unack[evalID]
   497  	if !ok {
   498  		return "", false
   499  	}
   500  	return unack.Token, true
   501  }
   502  
   503  // OutstandingReset resets the Nack timer for the EvalID if the
   504  // token matches and the eval is outstanding
   505  func (b *EvalBroker) OutstandingReset(evalID, token string) error {
   506  	b.l.RLock()
   507  	defer b.l.RUnlock()
   508  	unack, ok := b.unack[evalID]
   509  	if !ok {
   510  		return ErrNotOutstanding
   511  	}
   512  	if unack.Token != token {
   513  		return ErrTokenMismatch
   514  	}
   515  	if !unack.NackTimer.Reset(b.nackTimeout) {
   516  		return ErrNackTimeoutReached
   517  	}
   518  	return nil
   519  }
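
// NOTE: editorial sketch, not part of the original file. It shows a worker
// periodically calling OutstandingReset while it still holds an evaluation, so
// the Nack timer keeps being pushed out while progress is being made. The
// 30-second period and the done channel wiring are assumptions.
func exampleKeepAlive(b *EvalBroker, evalID, token string, done <-chan struct{}) {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-done:
			return
		case <-ticker.C:
			if err := b.OutstandingReset(evalID, token); err != nil {
				return // no longer outstanding, token mismatch, or timer already fired
			}
		}
	}
}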
   520  
   521  // Ack is used to positively acknowledge handling an evaluation
   522  func (b *EvalBroker) Ack(evalID, token string) error {
   523  	b.l.Lock()
   524  	defer b.l.Unlock()
   525  
   526  	// Always delete the requeued evaluation. Either the Ack is successful and
   527  	// we requeue it or it isn't and we want to remove it.
   528  	defer delete(b.requeue, token)
   529  
   530  	// Lookup the unack'd eval
   531  	unack, ok := b.unack[evalID]
   532  	if !ok {
   533  		return fmt.Errorf("Evaluation ID not found")
   534  	}
   535  	if unack.Token != token {
   536  		return fmt.Errorf("Token does not match for Evaluation ID")
   537  	}
   538  	jobID := unack.Eval.JobID
   539  
   540  	// Ensure we were able to stop the timer
   541  	if !unack.NackTimer.Stop() {
   542  		return fmt.Errorf("Evaluation ID Ack'd after Nack timer expiration")
   543  	}
   544  
   545  	// Update the stats
   546  	b.stats.TotalUnacked -= 1
   547  	queue := unack.Eval.Type
   548  	if b.evals[evalID] > b.deliveryLimit {
   549  		queue = failedQueue
   550  	}
   551  	bySched := b.stats.ByScheduler[queue]
   552  	bySched.Unacked -= 1
   553  
   554  	// Cleanup
   555  	delete(b.unack, evalID)
   556  	delete(b.evals, evalID)
   557  
   558  	namespacedID := structs.NamespacedID{
   559  		ID:        jobID,
   560  		Namespace: unack.Eval.Namespace,
   561  	}
   562  	delete(b.jobEvals, namespacedID)
   563  
   564  	// Check if there are any blocked evaluations
   565  	if blocked := b.blocked[namespacedID]; len(blocked) != 0 {
   566  		raw := heap.Pop(&blocked)
   567  		if len(blocked) > 0 {
   568  			b.blocked[namespacedID] = blocked
   569  		} else {
   570  			delete(b.blocked, namespacedID)
   571  		}
   572  		eval := raw.(*structs.Evaluation)
   573  		b.stats.TotalBlocked -= 1
   574  		b.enqueueLocked(eval, eval.Type)
   575  	}
   576  
   577  	// Re-enqueue the evaluation.
   578  	if eval, ok := b.requeue[token]; ok {
   579  		b.processEnqueue(eval, "")
   580  	}
   581  
   582  	return nil
   583  }
   584  
   585  // Nack is used to negatively acknowledge handling an evaluation
   586  func (b *EvalBroker) Nack(evalID, token string) error {
   587  	b.l.Lock()
   588  	defer b.l.Unlock()
   589  
   590  	// Always delete the requeued evaluation since the Nack means the requeue is
   591  	// invalid.
   592  	delete(b.requeue, token)
   593  
   594  	// Lookup the unack'd eval
   595  	unack, ok := b.unack[evalID]
   596  	if !ok {
   597  		return fmt.Errorf("Evaluation ID not found")
   598  	}
   599  	if unack.Token != token {
   600  		return fmt.Errorf("Token does not match for Evaluation ID")
   601  	}
   602  
   603  	// Stop the timer; it doesn't matter if it has already fired
   604  	unack.NackTimer.Stop()
   605  
   606  	// Cleanup
   607  	delete(b.unack, evalID)
   608  
   609  	// Update the stats
   610  	b.stats.TotalUnacked -= 1
   611  	bySched := b.stats.ByScheduler[unack.Eval.Type]
   612  	bySched.Unacked -= 1
   613  
   614  	// Check if we've hit the delivery limit, and re-enqueue
   615  	// in the failedQueue
   616  	if dequeues := b.evals[evalID]; dequeues >= b.deliveryLimit {
   617  		b.enqueueLocked(unack.Eval, failedQueue)
   618  	} else {
   619  		e := unack.Eval
   620  		e.Wait = b.nackReenqueueDelay(e, dequeues)
   621  
   622  		// See if there should be a delay before re-enqueuing
   623  		if e.Wait > 0 {
   624  			b.processWaitingEnqueue(e)
   625  		} else {
   626  			b.enqueueLocked(e, e.Type)
   627  		}
   628  	}
   629  
   630  	return nil
   631  }
   632  
   633  // nackReenqueueDelay determines the delay that should be applied to the
   634  // evaluation given the number of previous dequeue attempts (see the worked example below).
   635  func (b *EvalBroker) nackReenqueueDelay(eval *structs.Evaluation, prevDequeues int) time.Duration {
   636  	switch {
   637  	case prevDequeues <= 0:
   638  		return 0
   639  	case prevDequeues == 1:
   640  		return b.initialNackDelay
   641  	default:
   642  		// For each subsequent nack compound a delay
   643  		return time.Duration(prevDequeues-1) * b.subsequentNackDelay
   644  	}
   645  }
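
// Worked example (editorial, values assumed, not taken from this file): with
// initialNackDelay = 1s and subsequentNackDelay = 5s, an evaluation Nacked
// after its first dequeue is re-enqueued after 1s, after its second dequeue
// after (2-1)*5s = 5s, and after its third dequeue after (3-1)*5s = 10s. Once
// the dequeue count reaches deliveryLimit, Nack routes the evaluation to the
// failedQueue instead (see Nack above).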
   646  
   647  // PauseNackTimeout is used to pause the Nack timeout for an eval that is making
   648  // progress but is in a potentially unbounded operation such as the plan queue.
   649  func (b *EvalBroker) PauseNackTimeout(evalID, token string) error {
   650  	b.l.RLock()
   651  	defer b.l.RUnlock()
   652  	unack, ok := b.unack[evalID]
   653  	if !ok {
   654  		return ErrNotOutstanding
   655  	}
   656  	if unack.Token != token {
   657  		return ErrTokenMismatch
   658  	}
   659  	if !unack.NackTimer.Stop() {
   660  		return ErrNackTimeoutReached
   661  	}
   662  	return nil
   663  }
   664  
   665  // ResumeNackTimeout is used to resume the Nack timeout for an eval that was
   666  // paused. It should be resumed after leaving an unbounded operation.
   667  func (b *EvalBroker) ResumeNackTimeout(evalID, token string) error {
   668  	b.l.Lock()
   669  	defer b.l.Unlock()
   670  	unack, ok := b.unack[evalID]
   671  	if !ok {
   672  		return ErrNotOutstanding
   673  	}
   674  	if unack.Token != token {
   675  		return ErrTokenMismatch
   676  	}
   677  	unack.NackTimer.Reset(b.nackTimeout)
   678  	return nil
   679  }
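
// NOTE: editorial sketch, not part of the original file. It pairs
// PauseNackTimeout and ResumeNackTimeout around a potentially unbounded step
// (such as waiting in the plan queue) so that the evaluation is not Nacked
// while it is still making progress. submitPlan is a hypothetical placeholder.
func examplePauseForPlan(b *EvalBroker, eval *structs.Evaluation, token string, submitPlan func() error) error {
	if err := b.PauseNackTimeout(eval.ID, token); err != nil {
		return err // no longer outstanding, token mismatch, or timer already fired
	}
	planErr := submitPlan()
	if err := b.ResumeNackTimeout(eval.ID, token); err != nil {
		return err
	}
	return planErr
}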
   680  
   681  // flush is used to clear the state of the broker
   682  func (b *EvalBroker) flush() {
   683  	b.l.Lock()
   684  	defer b.l.Unlock()
   685  
   686  	// Unblock any waiters
   687  	for _, waitCh := range b.waiting {
   688  		close(waitCh)
   689  	}
   690  	b.waiting = make(map[string]chan struct{})
   691  
   692  	// Cancel any Nack timers
   693  	for _, unack := range b.unack {
   694  		unack.NackTimer.Stop()
   695  	}
   696  
   697  	// Cancel any time wait evals
   698  	for _, wait := range b.timeWait {
   699  		wait.Stop()
   700  	}
   701  
   702  	// Cancel the delayed evaluations goroutine
   703  	if b.delayedEvalCancelFunc != nil {
   704  		b.delayedEvalCancelFunc()
   705  	}
   706  
   707  	// Clear out the update channel for delayed evaluations
   708  	b.delayedEvalsUpdateCh = make(chan struct{}, 1)
   709  
   710  	// Reset the broker
   711  	b.stats.TotalReady = 0
   712  	b.stats.TotalUnacked = 0
   713  	b.stats.TotalBlocked = 0
   714  	b.stats.TotalWaiting = 0
   715  	b.stats.ByScheduler = make(map[string]*SchedulerStats)
   716  	b.evals = make(map[string]int)
   717  	b.jobEvals = make(map[structs.NamespacedID]string)
   718  	b.blocked = make(map[structs.NamespacedID]PendingEvaluations)
   719  	b.ready = make(map[string]PendingEvaluations)
   720  	b.unack = make(map[string]*unackEval)
   721  	b.timeWait = make(map[string]*time.Timer)
   722  	b.delayHeap = delayheap.NewDelayHeap()
   723  }
   724  
   725  // evalWrapper satisfies the HeapNode interface
   726  type evalWrapper struct {
   727  	eval *structs.Evaluation
   728  }
   729  
   730  func (d *evalWrapper) Data() interface{} {
   731  	return d.eval
   732  }
   733  
   734  func (d *evalWrapper) ID() string {
   735  	return d.eval.ID
   736  }
   737  
   738  func (d *evalWrapper) Namespace() string {
   739  	return d.eval.Namespace
   740  }
   741  
   742  // runDelayedEvalsWatcher is a long-lived function that waits until the time deadline
   743  // of each delayed evaluation is reached before enqueuing it
   744  func (b *EvalBroker) runDelayedEvalsWatcher(ctx context.Context) {
   745  	var timerChannel <-chan time.Time
   746  	var delayTimer *time.Timer
   747  	for {
   748  		eval, waitUntil := b.nextDelayedEval()
   749  		if waitUntil.IsZero() {
   750  			timerChannel = nil
   751  		} else {
   752  			launchDur := waitUntil.Sub(time.Now().UTC())
   753  			if delayTimer == nil {
   754  				delayTimer = time.NewTimer(launchDur)
   755  			} else {
   756  				delayTimer.Reset(launchDur)
   757  			}
   758  			timerChannel = delayTimer.C
   759  		}
   760  
   761  		select {
   762  		case <-ctx.Done():
   763  			return
   764  		case <-timerChannel:
   765  			// remove from the heap since we can enqueue it now
   766  			b.l.Lock()
   767  			b.delayHeap.Remove(&evalWrapper{eval})
   768  			b.stats.TotalWaiting -= 1
   769  			b.enqueueLocked(eval, eval.Type)
   770  			b.l.Unlock()
   771  		case <-b.delayedEvalsUpdateCh:
   772  			continue
   773  		}
   774  	}
   775  }
   776  
   777  // nextDelayedEval returns the next delayed eval to launch and when it should be enqueued.
   778  // This peeks at the heap to return the top. If the heap is empty, this returns nil and zero time.
   779  func (b *EvalBroker) nextDelayedEval() (*structs.Evaluation, time.Time) {
   780  	b.l.RLock()
   781  	// If there is nothing, return a zero time so the caller waits for an update.
   782  	if b.delayHeap.Length() == 0 {
   783  		b.l.RUnlock()
   784  		return nil, time.Time{}
   785  	}
   786  	nextEval := b.delayHeap.Peek()
   787  	b.l.RUnlock()
   788  	if nextEval == nil {
   789  		return nil, time.Time{}
   790  	}
   791  	eval := nextEval.Node.Data().(*structs.Evaluation)
   792  	return eval, nextEval.WaitUntil
   793  }
   794  
   795  // Stats is used to query the state of the broker
   796  func (b *EvalBroker) Stats() *BrokerStats {
   797  	// Allocate a new stats struct
   798  	stats := new(BrokerStats)
   799  	stats.ByScheduler = make(map[string]*SchedulerStats)
   800  
   801  	b.l.RLock()
   802  	defer b.l.RUnlock()
   803  
   804  	// Copy all the stats
   805  	stats.TotalReady = b.stats.TotalReady
   806  	stats.TotalUnacked = b.stats.TotalUnacked
   807  	stats.TotalBlocked = b.stats.TotalBlocked
   808  	stats.TotalWaiting = b.stats.TotalWaiting
   809  	for sched, subStat := range b.stats.ByScheduler {
   810  		subStatCopy := new(SchedulerStats)
   811  		*subStatCopy = *subStat
   812  		stats.ByScheduler[sched] = subStatCopy
   813  	}
   814  	return stats
   815  }
   816  
   817  // EmitStats is used to export metrics about the broker while enabled
   818  func (b *EvalBroker) EmitStats(period time.Duration, stopCh chan struct{}) {
   819  	for {
   820  		select {
   821  		case <-time.After(period):
   822  			stats := b.Stats()
   823  			metrics.SetGauge([]string{"nomad", "broker", "total_ready"}, float32(stats.TotalReady))
   824  			metrics.SetGauge([]string{"nomad", "broker", "total_unacked"}, float32(stats.TotalUnacked))
   825  			metrics.SetGauge([]string{"nomad", "broker", "total_blocked"}, float32(stats.TotalBlocked))
   826  			metrics.SetGauge([]string{"nomad", "broker", "total_waiting"}, float32(stats.TotalWaiting))
   827  			for sched, schedStats := range stats.ByScheduler {
   828  				metrics.SetGauge([]string{"nomad", "broker", sched, "ready"}, float32(schedStats.Ready))
   829  				metrics.SetGauge([]string{"nomad", "broker", sched, "unacked"}, float32(schedStats.Unacked))
   830  			}
   831  
   832  		case <-stopCh:
   833  			return
   834  		}
   835  	}
   836  }
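
// NOTE: editorial sketch, not part of the original file. It shows one way a
// caller could run EmitStats in the background and stop it later; the
// 10-second period is an assumption.
func exampleEmitStats(b *EvalBroker) (stop func()) {
	stopCh := make(chan struct{})
	go b.EmitStats(10*time.Second, stopCh)
	return func() { close(stopCh) }
}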
   837  
   838  // BrokerStats holds all the stats about the broker
   839  type BrokerStats struct {
   840  	TotalReady   int
   841  	TotalUnacked int
   842  	TotalBlocked int
   843  	TotalWaiting int
   844  	ByScheduler  map[string]*SchedulerStats
   845  }
   846  
   847  // SchedulerStats holds the stats per scheduler
   848  type SchedulerStats struct {
   849  	Ready   int
   850  	Unacked int
   851  }
   852  
   853  // Len is for the sorting interface
   854  func (p PendingEvaluations) Len() int {
   855  	return len(p)
   856  }
   857  
   858  // Less is for the sorting interface. We flip the check
   859  // so that the "min" in the min-heap is the element with the
   860  // highest priority
   861  func (p PendingEvaluations) Less(i, j int) bool {
   862  	if p[i].JobID != p[j].JobID && p[i].Priority != p[j].Priority {
   863  		return !(p[i].Priority < p[j].Priority)
   864  	}
   865  	return p[i].CreateIndex < p[j].CreateIndex
   866  }
   867  
   868  // Swap is for the sorting interface
   869  func (p PendingEvaluations) Swap(i, j int) {
   870  	p[i], p[j] = p[j], p[i]
   871  }
   872  
   873  // Push is used to add a new evaluation to the slice
   874  func (p *PendingEvaluations) Push(e interface{}) {
   875  	*p = append(*p, e.(*structs.Evaluation))
   876  }
   877  
   878  // Pop is used to remove an evaluation from the slice
   879  func (p *PendingEvaluations) Pop() interface{} {
   880  	n := len(*p)
   881  	e := (*p)[n-1]
   882  	(*p)[n-1] = nil
   883  	*p = (*p)[:n-1]
   884  	return e
   885  }
   886  
   887  // Peek is used to peek at the next element that would be popped, i.e. the heap root at index 0
   888  func (p PendingEvaluations) Peek() *structs.Evaluation {
   889  	n := len(p)
   890  	if n == 0 {
   891  		return nil
   892  	}
   893  	return p[0]
   894  }
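
// NOTE: editorial sketch, not part of the original file. It demonstrates how
// PendingEvaluations behaves under container/heap: because Less is inverted,
// the highest priority evaluation is popped first, and evaluations for the
// same job (or with equal priority) fall back to CreateIndex order. The field
// values are illustrative.
func examplePendingOrder() *structs.Evaluation {
	pending := PendingEvaluations{}
	heap.Push(&pending, &structs.Evaluation{ID: "low", JobID: "job-a", Priority: 10, CreateIndex: 1})
	heap.Push(&pending, &structs.Evaluation{ID: "high", JobID: "job-b", Priority: 90, CreateIndex: 2})

	// Pops the Priority 90 evaluation ("high") even though it was pushed later.
	return heap.Pop(&pending).(*structs.Evaluation)
}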