github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/eval_broker.go (about)

     1  package nomad
     2  
     3  import (
     4  	"container/heap"
     5  	"errors"
     6  	"fmt"
     7  	"math/rand"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/armon/go-metrics"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
const (
	// failedQueue is the name of the queue that Evaluations are added to
	// once they have reached the deliveryLimit. This allows the leader to
	// set the status to failed.
	failedQueue = "_failed"
)
    21  
var (
	// ErrNotOutstanding is returned if an evaluation is not outstanding
	ErrNotOutstanding = errors.New("evaluation is not outstanding")

	// ErrTokenMismatch is returned if the outstanding evaluation has a
	// different token than the one supplied by the caller
	ErrTokenMismatch = errors.New("evaluation token does not match")

	// ErrNackTimeoutReached is returned if the Nack timer of an evaluation
	// has already expired when a reset is attempted
	ErrNackTimeoutReached = errors.New("evaluation nack timeout reached")
)
    32  
// EvalBroker is used to manage brokering of evaluations. When an evaluation is
// created, due to a change in a job specification or a node, we put it into the
// broker. The broker sorts evaluations by priority and scheduler type. This
// allows us to dequeue the highest priority work first, while also allowing sub-schedulers
// to only dequeue work they know how to handle. The broker is designed to be entirely
// in-memory and is managed by the leader node.
//
// The broker must provide at-least-once delivery semantics. It relies on explicit
// Ack/Nack messages to handle this. If a delivery is not Ack'd in a sufficient time
// span, it will be assumed Nack'd.
type EvalBroker struct {
	// nackTimeout is how long a delivered evaluation may stay
	// un-acknowledged before its Nack timer fires. deliveryLimit is the
	// delivery count at which a Nack'd evaluation is routed to the
	// failedQueue instead of its own scheduler queue.
	nackTimeout   time.Duration
	deliveryLimit int

	// enabled gates enqueue and dequeue; nothing is queued or handed out
	// while it is false. stats holds the counters reported by Stats.
	enabled bool
	stats   *BrokerStats

	// evals tracks queued evaluations by ID to de-duplicate enqueue.
	// The counter is the number of times we've attempted delivery,
	// and is used to eventually fail an evaluation.
	evals map[string]int

	// jobEvals tracks queued evaluations by JobID to serialize them
	jobEvals map[string]string

	// blocked tracks the blocked evaluations by JobID in a priority queue
	blocked map[string]PendingEvaluations

	// ready tracks the ready jobs by scheduler in a priority queue
	ready map[string]PendingEvaluations

	// unack is a map of evalID to an un-acknowledged evaluation
	unack map[string]*unackEval

	// waiting is used to notify on a per-scheduler basis of ready work
	waiting map[string]chan struct{}

	// timeWait has evaluations that are waiting for time to elapse
	timeWait map[string]*time.Timer

	// l is the lock the broker's methods take before reading or
	// modifying the state above.
	l sync.RWMutex
}
    75  
// unackEval tracks an unacknowledged evaluation along with the token that
// was handed out when it was dequeued and the Nack timer that fires if the
// delivery is not acknowledged in time.
type unackEval struct {
	Eval      *structs.Evaluation
	Token     string
	NackTimer *time.Timer
}
    82  
// PendingEvaluations is a list of waiting evaluations.
// It implements the container/heap interface so that it can be used as a
// priority queue.
type PendingEvaluations []*structs.Evaluation
    87  
    88  // NewEvalBroker creates a new evaluation broker. This is parameterized
    89  // with the timeout used for messages that are not acknowledged before we
    90  // assume a Nack and attempt to redeliver as well as the deliveryLimit
    91  // which prevents a failing eval from being endlessly delivered.
    92  func NewEvalBroker(timeout time.Duration, deliveryLimit int) (*EvalBroker, error) {
    93  	if timeout < 0 {
    94  		return nil, fmt.Errorf("timeout cannot be negative")
    95  	}
    96  	b := &EvalBroker{
    97  		nackTimeout:   timeout,
    98  		deliveryLimit: deliveryLimit,
    99  		enabled:       false,
   100  		stats:         new(BrokerStats),
   101  		evals:         make(map[string]int),
   102  		jobEvals:      make(map[string]string),
   103  		blocked:       make(map[string]PendingEvaluations),
   104  		ready:         make(map[string]PendingEvaluations),
   105  		unack:         make(map[string]*unackEval),
   106  		waiting:       make(map[string]chan struct{}),
   107  		timeWait:      make(map[string]*time.Timer),
   108  	}
   109  	b.stats.ByScheduler = make(map[string]*SchedulerStats)
   110  	return b, nil
   111  }
   112  
   113  // Enabled is used to check if the broker is enabled.
   114  func (b *EvalBroker) Enabled() bool {
   115  	b.l.RLock()
   116  	defer b.l.RUnlock()
   117  	return b.enabled
   118  }
   119  
   120  // SetEnabled is used to control if the broker is enabled. The broker
   121  // should only be enabled on the active leader.
   122  func (b *EvalBroker) SetEnabled(enabled bool) {
   123  	b.l.Lock()
   124  	b.enabled = enabled
   125  	b.l.Unlock()
   126  	if !enabled {
   127  		b.Flush()
   128  	}
   129  }
   130  
   131  // Enqueue is used to enqueue an evaluation
   132  func (b *EvalBroker) Enqueue(eval *structs.Evaluation) error {
   133  	b.l.Lock()
   134  	defer b.l.Unlock()
   135  
   136  	// Check if already enqueued
   137  	if _, ok := b.evals[eval.ID]; ok {
   138  		return nil
   139  	} else if b.enabled {
   140  		b.evals[eval.ID] = 0
   141  	}
   142  
   143  	// Check if we need to enforce a wait
   144  	if eval.Wait > 0 {
   145  		timer := time.AfterFunc(eval.Wait, func() {
   146  			b.enqueueWaiting(eval)
   147  		})
   148  		b.timeWait[eval.ID] = timer
   149  		b.stats.TotalWaiting += 1
   150  		return nil
   151  	}
   152  
   153  	b.enqueueLocked(eval, eval.Type)
   154  	return nil
   155  }
   156  
   157  // enqueueWaiting is used to enqueue a waiting evaluation
   158  func (b *EvalBroker) enqueueWaiting(eval *structs.Evaluation) {
   159  	b.l.Lock()
   160  	defer b.l.Unlock()
   161  	delete(b.timeWait, eval.ID)
   162  	b.stats.TotalWaiting -= 1
   163  	b.enqueueLocked(eval, eval.Type)
   164  }
   165  
   166  // enqueueLocked is used to enqueue with the lock held
   167  func (b *EvalBroker) enqueueLocked(eval *structs.Evaluation, queue string) {
   168  	// Do nothing if not enabled
   169  	if !b.enabled {
   170  		return
   171  	}
   172  
   173  	// Check if there is an evaluation for this JobID pending
   174  	pendingEval := b.jobEvals[eval.JobID]
   175  	if pendingEval == "" {
   176  		b.jobEvals[eval.JobID] = eval.ID
   177  	} else if pendingEval != eval.ID {
   178  		blocked := b.blocked[eval.JobID]
   179  		heap.Push(&blocked, eval)
   180  		b.blocked[eval.JobID] = blocked
   181  		b.stats.TotalBlocked += 1
   182  		return
   183  	}
   184  
   185  	// Find the pending by scheduler class
   186  	pending, ok := b.ready[queue]
   187  	if !ok {
   188  		pending = make([]*structs.Evaluation, 0, 16)
   189  		if _, ok := b.waiting[queue]; !ok {
   190  			b.waiting[queue] = make(chan struct{}, 1)
   191  		}
   192  	}
   193  
   194  	// Push onto the heap
   195  	heap.Push(&pending, eval)
   196  	b.ready[queue] = pending
   197  
   198  	// Update the stats
   199  	b.stats.TotalReady += 1
   200  	bySched, ok := b.stats.ByScheduler[queue]
   201  	if !ok {
   202  		bySched = &SchedulerStats{}
   203  		b.stats.ByScheduler[queue] = bySched
   204  	}
   205  	bySched.Ready += 1
   206  
   207  	// Unblock any blocked dequeues
   208  	select {
   209  	case b.waiting[queue] <- struct{}{}:
   210  	default:
   211  	}
   212  }
   213  
   214  // Dequeue is used to perform a blocking dequeue
   215  func (b *EvalBroker) Dequeue(schedulers []string, timeout time.Duration) (*structs.Evaluation, string, error) {
   216  	var timeoutTimer *time.Timer
   217  	var timeoutCh <-chan time.Time
   218  SCAN:
   219  	// Scan for work
   220  	eval, token, err := b.scanForSchedulers(schedulers)
   221  	if err != nil {
   222  		if timeoutTimer != nil {
   223  			timeoutTimer.Stop()
   224  		}
   225  		return nil, "", err
   226  	}
   227  
   228  	// Check if we have something
   229  	if eval != nil {
   230  		if timeoutTimer != nil {
   231  			timeoutTimer.Stop()
   232  		}
   233  		return eval, token, nil
   234  	}
   235  
   236  	// Setup the timeout channel the first time around
   237  	if timeoutTimer == nil && timeout != 0 {
   238  		timeoutTimer = time.NewTimer(timeout)
   239  		timeoutCh = timeoutTimer.C
   240  	}
   241  
   242  	// Block until we get work
   243  	scan := b.waitForSchedulers(schedulers, timeoutCh)
   244  	if scan {
   245  		goto SCAN
   246  	}
   247  	return nil, "", nil
   248  }
   249  
   250  // scanForSchedulers scans for work on any of the schedulers. The highest priority work
   251  // is dequeued first. This may return nothing if there is no work waiting.
   252  func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation, string, error) {
   253  	b.l.Lock()
   254  	defer b.l.Unlock()
   255  
   256  	// Do nothing if not enabled
   257  	if !b.enabled {
   258  		return nil, "", fmt.Errorf("eval broker disabled")
   259  	}
   260  
   261  	// Scan for eligible work
   262  	var eligibleSched []string
   263  	var eligiblePriority int
   264  	for _, sched := range schedulers {
   265  		// Get the pending queue
   266  		pending, ok := b.ready[sched]
   267  		if !ok {
   268  			continue
   269  		}
   270  
   271  		// Peek at the next item
   272  		ready := pending.Peek()
   273  		if ready == nil {
   274  			continue
   275  		}
   276  
   277  		// Add to eligible if equal or greater priority
   278  		if len(eligibleSched) == 0 || ready.Priority > eligiblePriority {
   279  			eligibleSched = []string{sched}
   280  			eligiblePriority = ready.Priority
   281  
   282  		} else if eligiblePriority > ready.Priority {
   283  			continue
   284  
   285  		} else if eligiblePriority == ready.Priority {
   286  			eligibleSched = append(eligibleSched, sched)
   287  		}
   288  	}
   289  
   290  	// Determine behavior based on eligible work
   291  	switch n := len(eligibleSched); n {
   292  	case 0:
   293  		// No work to do!
   294  		return nil, "", nil
   295  
   296  	case 1:
   297  		// Only a single task, dequeue
   298  		return b.dequeueForSched(eligibleSched[0])
   299  
   300  	default:
   301  		// Multiple tasks. We pick a random task so that we fairly
   302  		// distribute work.
   303  		offset := rand.Int63() % int64(n)
   304  		return b.dequeueForSched(eligibleSched[offset])
   305  	}
   306  }
   307  
   308  // dequeueForSched is used to dequeue the next work item for a given scheduler.
   309  // This assumes locks are held and that this scheduler has work
   310  func (b *EvalBroker) dequeueForSched(sched string) (*structs.Evaluation, string, error) {
   311  	// Get the pending queue
   312  	pending := b.ready[sched]
   313  	raw := heap.Pop(&pending)
   314  	b.ready[sched] = pending
   315  	eval := raw.(*structs.Evaluation)
   316  
   317  	// Generate a UUID for the token
   318  	token := structs.GenerateUUID()
   319  
   320  	// Setup Nack timer
   321  	nackTimer := time.AfterFunc(b.nackTimeout, func() {
   322  		b.Nack(eval.ID, token)
   323  	})
   324  
   325  	// Add to the unack queue
   326  	b.unack[eval.ID] = &unackEval{
   327  		Eval:      eval,
   328  		Token:     token,
   329  		NackTimer: nackTimer,
   330  	}
   331  
   332  	// Increment the dequeue count
   333  	b.evals[eval.ID] += 1
   334  
   335  	// Update the stats
   336  	b.stats.TotalReady -= 1
   337  	b.stats.TotalUnacked += 1
   338  	bySched := b.stats.ByScheduler[sched]
   339  	bySched.Ready -= 1
   340  	bySched.Unacked += 1
   341  
   342  	return eval, token, nil
   343  }
   344  
   345  // waitForSchedulers is used to wait for work on any of the scheduler or until a timeout.
   346  // Returns if there is work waiting potentially.
   347  func (b *EvalBroker) waitForSchedulers(schedulers []string, timeoutCh <-chan time.Time) bool {
   348  	doneCh := make(chan struct{})
   349  	readyCh := make(chan struct{}, 1)
   350  	defer close(doneCh)
   351  
   352  	// Start all the watchers
   353  	b.l.Lock()
   354  	for _, sched := range schedulers {
   355  		waitCh, ok := b.waiting[sched]
   356  		if !ok {
   357  			waitCh = make(chan struct{}, 1)
   358  			b.waiting[sched] = waitCh
   359  		}
   360  
   361  		// Start a goroutine that either waits for the waitCh on this scheduler
   362  		// to unblock or for this waitForSchedulers call to return
   363  		go func() {
   364  			select {
   365  			case <-waitCh:
   366  				select {
   367  				case readyCh <- struct{}{}:
   368  				default:
   369  				}
   370  			case <-doneCh:
   371  			}
   372  		}()
   373  	}
   374  	b.l.Unlock()
   375  
   376  	// Block until we have ready work and should scan, or until we timeout
   377  	// and should not make an attempt to scan for work
   378  	select {
   379  	case <-readyCh:
   380  		return true
   381  	case <-timeoutCh:
   382  		return false
   383  	}
   384  }
   385  
   386  // Outstanding checks if an EvalID has been delivered but not acknowledged
   387  // and returns the associated token for the evaluation.
   388  func (b *EvalBroker) Outstanding(evalID string) (string, bool) {
   389  	b.l.RLock()
   390  	defer b.l.RUnlock()
   391  	unack, ok := b.unack[evalID]
   392  	if !ok {
   393  		return "", false
   394  	}
   395  	return unack.Token, true
   396  }
   397  
   398  // OutstandingReset resets the Nack timer for the EvalID if the
   399  // token matches and the eval is outstanding
   400  func (b *EvalBroker) OutstandingReset(evalID, token string) error {
   401  	b.l.RLock()
   402  	defer b.l.RUnlock()
   403  	unack, ok := b.unack[evalID]
   404  	if !ok {
   405  		return ErrNotOutstanding
   406  	}
   407  	if unack.Token != token {
   408  		return ErrTokenMismatch
   409  	}
   410  	if !unack.NackTimer.Reset(b.nackTimeout) {
   411  		return ErrNackTimeoutReached
   412  	}
   413  	return nil
   414  }
   415  
   416  // Ack is used to positively acknowledge handling an evaluation
   417  func (b *EvalBroker) Ack(evalID, token string) error {
   418  	b.l.Lock()
   419  	defer b.l.Unlock()
   420  
   421  	// Lookup the unack'd eval
   422  	unack, ok := b.unack[evalID]
   423  	if !ok {
   424  		return fmt.Errorf("Evaluation ID not found")
   425  	}
   426  	if unack.Token != token {
   427  		return fmt.Errorf("Token does not match for Evaluation ID")
   428  	}
   429  	jobID := unack.Eval.JobID
   430  
   431  	// Ensure we were able to stop the timer
   432  	if !unack.NackTimer.Stop() {
   433  		return fmt.Errorf("Evaluation ID Ack'd after Nack timer expiration")
   434  	}
   435  
   436  	// Update the stats
   437  	b.stats.TotalUnacked -= 1
   438  	queue := unack.Eval.Type
   439  	if b.evals[evalID] >= b.deliveryLimit {
   440  		queue = failedQueue
   441  	}
   442  	bySched := b.stats.ByScheduler[queue]
   443  	bySched.Unacked -= 1
   444  
   445  	// Cleanup
   446  	delete(b.unack, evalID)
   447  	delete(b.evals, evalID)
   448  	delete(b.jobEvals, jobID)
   449  
   450  	// Check if there are any blocked evaluations
   451  	if blocked := b.blocked[jobID]; len(blocked) != 0 {
   452  		raw := heap.Pop(&blocked)
   453  		if len(blocked) > 0 {
   454  			b.blocked[jobID] = blocked
   455  		} else {
   456  			delete(b.blocked, jobID)
   457  		}
   458  		eval := raw.(*structs.Evaluation)
   459  		b.stats.TotalBlocked -= 1
   460  		b.enqueueLocked(eval, eval.Type)
   461  		return nil
   462  	}
   463  	return nil
   464  }
   465  
   466  // Nack is used to negatively acknowledge handling an evaluation
   467  func (b *EvalBroker) Nack(evalID, token string) error {
   468  	b.l.Lock()
   469  	defer b.l.Unlock()
   470  
   471  	// Lookup the unack'd eval
   472  	unack, ok := b.unack[evalID]
   473  	if !ok {
   474  		return fmt.Errorf("Evaluation ID not found")
   475  	}
   476  	if unack.Token != token {
   477  		return fmt.Errorf("Token does not match for Evaluation ID")
   478  	}
   479  
   480  	// Stop the timer, doesn't matter if we've missed it
   481  	unack.NackTimer.Stop()
   482  
   483  	// Cleanup
   484  	delete(b.unack, evalID)
   485  
   486  	// Update the stats
   487  	b.stats.TotalUnacked -= 1
   488  	bySched := b.stats.ByScheduler[unack.Eval.Type]
   489  	bySched.Unacked -= 1
   490  
   491  	// Check if we've hit the delivery limit, and re-enqueue
   492  	// in the failedQueue
   493  	if b.evals[evalID] >= b.deliveryLimit {
   494  		b.enqueueLocked(unack.Eval, failedQueue)
   495  	} else {
   496  		b.enqueueLocked(unack.Eval, unack.Eval.Type)
   497  	}
   498  	return nil
   499  }
   500  
   501  // Flush is used to clear the state of the broker
   502  func (b *EvalBroker) Flush() {
   503  	b.l.Lock()
   504  	defer b.l.Unlock()
   505  
   506  	// Unblock any waiters
   507  	for _, waitCh := range b.waiting {
   508  		close(waitCh)
   509  	}
   510  	b.waiting = make(map[string]chan struct{})
   511  
   512  	// Cancel any Nack timers
   513  	for _, unack := range b.unack {
   514  		unack.NackTimer.Stop()
   515  	}
   516  
   517  	// Cancel any time wait evals
   518  	for _, wait := range b.timeWait {
   519  		wait.Stop()
   520  	}
   521  
   522  	// Reset the broker
   523  	b.stats.TotalReady = 0
   524  	b.stats.TotalUnacked = 0
   525  	b.stats.TotalBlocked = 0
   526  	b.stats.TotalWaiting = 0
   527  	b.stats.ByScheduler = make(map[string]*SchedulerStats)
   528  	b.evals = make(map[string]int)
   529  	b.jobEvals = make(map[string]string)
   530  	b.blocked = make(map[string]PendingEvaluations)
   531  	b.ready = make(map[string]PendingEvaluations)
   532  	b.unack = make(map[string]*unackEval)
   533  	b.timeWait = make(map[string]*time.Timer)
   534  }
   535  
   536  // Stats is used to query the state of the broker
   537  func (b *EvalBroker) Stats() *BrokerStats {
   538  	// Allocate a new stats struct
   539  	stats := new(BrokerStats)
   540  	stats.ByScheduler = make(map[string]*SchedulerStats)
   541  
   542  	b.l.RLock()
   543  	defer b.l.RUnlock()
   544  
   545  	// Copy all the stats
   546  	stats.TotalReady = b.stats.TotalReady
   547  	stats.TotalUnacked = b.stats.TotalUnacked
   548  	stats.TotalBlocked = b.stats.TotalBlocked
   549  	stats.TotalWaiting = b.stats.TotalWaiting
   550  	for sched, subStat := range b.stats.ByScheduler {
   551  		subStatCopy := new(SchedulerStats)
   552  		*subStatCopy = *subStat
   553  		stats.ByScheduler[sched] = subStatCopy
   554  	}
   555  	return stats
   556  }
   557  
   558  // EmitStats is used to export metrics about the broker while enabled
   559  func (b *EvalBroker) EmitStats(period time.Duration, stopCh chan struct{}) {
   560  	for {
   561  		select {
   562  		case <-time.After(period):
   563  			stats := b.Stats()
   564  			metrics.SetGauge([]string{"nomad", "broker", "total_ready"}, float32(stats.TotalReady))
   565  			metrics.SetGauge([]string{"nomad", "broker", "total_unacked"}, float32(stats.TotalUnacked))
   566  			metrics.SetGauge([]string{"nomad", "broker", "total_blocked"}, float32(stats.TotalBlocked))
   567  			metrics.SetGauge([]string{"nomad", "broker", "total_waiting"}, float32(stats.TotalWaiting))
   568  			for sched, schedStats := range stats.ByScheduler {
   569  				metrics.SetGauge([]string{"nomad", "broker", sched, "ready"}, float32(schedStats.Ready))
   570  				metrics.SetGauge([]string{"nomad", "broker", sched, "unacked"}, float32(schedStats.Unacked))
   571  			}
   572  
   573  		case <-stopCh:
   574  			return
   575  		}
   576  	}
   577  }
   578  
// BrokerStats holds all the stats about the broker: the number of
// evaluations that are ready, delivered but un-acknowledged, blocked behind
// another evaluation of the same job, or waiting for a timer to elapse,
// together with a per-queue breakdown keyed by scheduler name.
type BrokerStats struct {
	TotalReady   int
	TotalUnacked int
	TotalBlocked int
	TotalWaiting int
	ByScheduler  map[string]*SchedulerStats
}
   587  
// SchedulerStats holds the stats of a single scheduler queue: the number of
// evaluations ready to be dequeued and the number delivered but not yet
// acknowledged.
type SchedulerStats struct {
	Ready   int
	Unacked int
}
   593  
// Len returns the number of pending evaluations. It is part of the sorting
// interface that container/heap requires.
func (p PendingEvaluations) Len() int {
	return len(p)
}
   598  
   599  // Less is for the sorting interface. We flip the check
   600  // so that the "min" in the min-heap is the element with the
   601  // highest priority
   602  func (p PendingEvaluations) Less(i, j int) bool {
   603  	if p[i].JobID != p[j].JobID && p[i].Priority != p[j].Priority {
   604  		return !(p[i].Priority < p[j].Priority)
   605  	}
   606  	return p[i].CreateIndex < p[j].CreateIndex
   607  }
   608  
   609  // Swap is for the sorting interface
   610  func (p PendingEvaluations) Swap(i, j int) {
   611  	p[i], p[j] = p[j], p[i]
   612  }
   613  
   614  // Push is used to add a new evalution to the slice
   615  func (p *PendingEvaluations) Push(e interface{}) {
   616  	*p = append(*p, e.(*structs.Evaluation))
   617  }
   618  
   619  // Pop is used to remove an evaluation from the slice
   620  func (p *PendingEvaluations) Pop() interface{} {
   621  	n := len(*p)
   622  	e := (*p)[n-1]
   623  	(*p)[n-1] = nil
   624  	*p = (*p)[:n-1]
   625  	return e
   626  }
   627  
   628  // Peek is used to peek at the next element that would be popped
   629  func (p PendingEvaluations) Peek() *structs.Evaluation {
   630  	n := len(p)
   631  	if n == 0 {
   632  		return nil
   633  	}
   634  	return p[n-1]
   635  }