github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/eval_broker.go (about)

     1  package nomad
     2  
     3  import (
     4  	"container/heap"
     5  	"fmt"
     6  	"math/rand"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/armon/go-metrics"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  )
    13  
const (
	// failedQueue is the name of the queue that Evaluations are added to
	// once they have reached the deliveryLimit. This allows the leader to
	// set the status to failed.
	failedQueue = "_failed"
)
    20  
// EvalBroker is used to manage brokering of evaluations. When an evaluation is
// created, due to a change in a job specification or a node, we put it into the
// broker. The broker sorts evaluations by priority and scheduler type. This
// allows us to dequeue the highest priority work first, while also allowing
// sub-schedulers to only dequeue work they know how to handle. The broker is
// designed to be entirely in-memory and is managed by the leader node.
//
// The broker must provide at-least-once delivery semantics. It relies on explicit
// Ack/Nack messages to handle this. If a delivery is not Ack'd in a sufficient time
// span, it will be assumed Nack'd.
type EvalBroker struct {
	// nackTimeout is how long a dequeued evaluation may remain
	// unacknowledged before its timer Nacks it automatically.
	nackTimeout time.Duration
	// deliveryLimit is the dequeue count at which a Nack'd evaluation is
	// moved to the failedQueue instead of its own scheduler queue.
	deliveryLimit int

	// enabled gates enqueue and dequeue; stats holds the live counters.
	enabled bool
	stats   *BrokerStats

	// evals tracks queued evaluations by ID to de-duplicate enqueue.
	// The counter is the number of times we've attempted delivery,
	// and is used to eventually fail an evaluation.
	evals map[string]int

	// jobEvals maps a JobID to the ID of the evaluation currently queued
	// or outstanding for that job, so evaluations of a job are serialized.
	jobEvals map[string]string

	// blocked tracks the blocked evaluations by JobID in a priority queue
	blocked map[string]PendingEvaluations

	// ready tracks the ready evaluations by scheduler in a priority queue
	ready map[string]PendingEvaluations

	// unack is a map of evalID to an un-acknowledged evaluation
	unack map[string]*unackEval

	// waiting is used to notify on a per-scheduler basis of ready work
	waiting map[string]chan struct{}

	// timeWait has the timers of evaluations that are waiting for their
	// Wait duration to elapse, keyed by evaluation ID.
	timeWait map[string]*time.Timer

	// l is the lock the broker's methods take before touching the fields above.
	l sync.RWMutex
}
    63  
// unackEval tracks an unacknowledged evaluation along with the Nack timer
type unackEval struct {
	// Eval is the evaluation that was handed out by a dequeue.
	Eval *structs.Evaluation
	// Token is the value generated at dequeue time; Ack and Nack must
	// present the same token.
	Token string
	// NackTimer Nacks the evaluation when the broker's nackTimeout elapses.
	NackTimer *time.Timer
}
    70  
// PendingEvaluations is a list of waiting evaluations.
// We implement the container/heap interface so that this is a
// priority queue. It must only be modified through heap.Push and
// heap.Pop so that the heap ordering is maintained.
type PendingEvaluations []*structs.Evaluation
    75  
    76  // NewEvalBroker creates a new evaluation broker. This is parameterized
    77  // with the timeout used for messages that are not acknowledged before we
    78  // assume a Nack and attempt to redeliver as well as the deliveryLimit
    79  // which prevents a failing eval from being endlessly delivered.
    80  func NewEvalBroker(timeout time.Duration, deliveryLimit int) (*EvalBroker, error) {
    81  	if timeout < 0 {
    82  		return nil, fmt.Errorf("timeout cannot be negative")
    83  	}
    84  	b := &EvalBroker{
    85  		nackTimeout:   timeout,
    86  		deliveryLimit: deliveryLimit,
    87  		enabled:       false,
    88  		stats:         new(BrokerStats),
    89  		evals:         make(map[string]int),
    90  		jobEvals:      make(map[string]string),
    91  		blocked:       make(map[string]PendingEvaluations),
    92  		ready:         make(map[string]PendingEvaluations),
    93  		unack:         make(map[string]*unackEval),
    94  		waiting:       make(map[string]chan struct{}),
    95  		timeWait:      make(map[string]*time.Timer),
    96  	}
    97  	b.stats.ByScheduler = make(map[string]*SchedulerStats)
    98  	return b, nil
    99  }
   100  
   101  // Enabled is used to check if the broker is enabled.
   102  func (b *EvalBroker) Enabled() bool {
   103  	b.l.RLock()
   104  	defer b.l.RUnlock()
   105  	return b.enabled
   106  }
   107  
   108  // SetEnabled is used to control if the broker is enabled. The broker
   109  // should only be enabled on the active leader.
   110  func (b *EvalBroker) SetEnabled(enabled bool) {
   111  	b.l.Lock()
   112  	b.enabled = enabled
   113  	b.l.Unlock()
   114  	if !enabled {
   115  		b.Flush()
   116  	}
   117  }
   118  
   119  // Enqueue is used to enqueue an evaluation
   120  func (b *EvalBroker) Enqueue(eval *structs.Evaluation) error {
   121  	b.l.Lock()
   122  	defer b.l.Unlock()
   123  
   124  	// Check if already enqueued
   125  	if _, ok := b.evals[eval.ID]; ok {
   126  		return nil
   127  	} else if b.enabled {
   128  		b.evals[eval.ID] = 0
   129  	}
   130  
   131  	// Check if we need to enforce a wait
   132  	if eval.Wait > 0 {
   133  		timer := time.AfterFunc(eval.Wait, func() {
   134  			b.enqueueWaiting(eval)
   135  		})
   136  		b.timeWait[eval.ID] = timer
   137  		b.stats.TotalWaiting += 1
   138  		return nil
   139  	}
   140  
   141  	b.enqueueLocked(eval, eval.Type)
   142  	return nil
   143  }
   144  
   145  // enqueueWaiting is used to enqueue a waiting evaluation
   146  func (b *EvalBroker) enqueueWaiting(eval *structs.Evaluation) {
   147  	b.l.Lock()
   148  	defer b.l.Unlock()
   149  	delete(b.timeWait, eval.ID)
   150  	b.stats.TotalWaiting -= 1
   151  	b.enqueueLocked(eval, eval.Type)
   152  }
   153  
   154  // enqueueLocked is used to enqueue with the lock held
   155  func (b *EvalBroker) enqueueLocked(eval *structs.Evaluation, queue string) {
   156  	// Do nothing if not enabled
   157  	if !b.enabled {
   158  		return
   159  	}
   160  
   161  	// Check if there is an evaluation for this JobID pending
   162  	pendingEval := b.jobEvals[eval.JobID]
   163  	if pendingEval == "" {
   164  		b.jobEvals[eval.JobID] = eval.ID
   165  	} else if pendingEval != eval.ID {
   166  		blocked := b.blocked[eval.JobID]
   167  		heap.Push(&blocked, eval)
   168  		b.blocked[eval.JobID] = blocked
   169  		b.stats.TotalBlocked += 1
   170  		return
   171  	}
   172  
   173  	// Find the pending by scheduler class
   174  	pending, ok := b.ready[queue]
   175  	if !ok {
   176  		pending = make([]*structs.Evaluation, 0, 16)
   177  		if _, ok := b.waiting[queue]; !ok {
   178  			b.waiting[queue] = make(chan struct{}, 1)
   179  		}
   180  	}
   181  
   182  	// Push onto the heap
   183  	heap.Push(&pending, eval)
   184  	b.ready[queue] = pending
   185  
   186  	// Update the stats
   187  	b.stats.TotalReady += 1
   188  	bySched, ok := b.stats.ByScheduler[queue]
   189  	if !ok {
   190  		bySched = &SchedulerStats{}
   191  		b.stats.ByScheduler[queue] = bySched
   192  	}
   193  	bySched.Ready += 1
   194  
   195  	// Unblock any blocked dequeues
   196  	select {
   197  	case b.waiting[queue] <- struct{}{}:
   198  	default:
   199  	}
   200  }
   201  
   202  // Dequeue is used to perform a blocking dequeue
   203  func (b *EvalBroker) Dequeue(schedulers []string, timeout time.Duration) (*structs.Evaluation, string, error) {
   204  	var timeoutTimer *time.Timer
   205  SCAN:
   206  	// Scan for work
   207  	eval, token, err := b.scanForSchedulers(schedulers)
   208  	if err != nil {
   209  		if timeoutTimer != nil {
   210  			timeoutTimer.Stop()
   211  		}
   212  		return nil, "", err
   213  	}
   214  
   215  	// Check if we have something
   216  	if eval != nil {
   217  		if timeoutTimer != nil {
   218  			timeoutTimer.Stop()
   219  		}
   220  		return eval, token, nil
   221  	}
   222  
   223  	// Setup the timeout channel the first time around
   224  	if timeoutTimer == nil && timeout != 0 {
   225  		timeoutTimer = time.NewTimer(timeout)
   226  	}
   227  
   228  	// Block until we get work
   229  	scan := b.waitForSchedulers(schedulers, timeoutTimer.C)
   230  	if scan {
   231  		goto SCAN
   232  	}
   233  	return nil, "", nil
   234  }
   235  
   236  // scanForSchedulers scans for work on any of the schedulers. The highest priority work
   237  // is dequeued first. This may return nothing if there is no work waiting.
   238  func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation, string, error) {
   239  	b.l.Lock()
   240  	defer b.l.Unlock()
   241  
   242  	// Do nothing if not enabled
   243  	if !b.enabled {
   244  		return nil, "", fmt.Errorf("eval broker disabled")
   245  	}
   246  
   247  	// Scan for eligible work
   248  	var eligibleSched []string
   249  	var eligiblePriority int
   250  	for _, sched := range schedulers {
   251  		// Get the pending queue
   252  		pending, ok := b.ready[sched]
   253  		if !ok {
   254  			continue
   255  		}
   256  
   257  		// Peek at the next item
   258  		ready := pending.Peek()
   259  		if ready == nil {
   260  			continue
   261  		}
   262  
   263  		// Add to eligible if equal or greater priority
   264  		if len(eligibleSched) == 0 || ready.Priority > eligiblePriority {
   265  			eligibleSched = []string{sched}
   266  			eligiblePriority = ready.Priority
   267  
   268  		} else if eligiblePriority > ready.Priority {
   269  			continue
   270  
   271  		} else if eligiblePriority == ready.Priority {
   272  			eligibleSched = append(eligibleSched, sched)
   273  		}
   274  	}
   275  
   276  	// Determine behavior based on eligible work
   277  	switch n := len(eligibleSched); n {
   278  	case 0:
   279  		// No work to do!
   280  		return nil, "", nil
   281  
   282  	case 1:
   283  		// Only a single task, dequeue
   284  		return b.dequeueForSched(eligibleSched[0])
   285  
   286  	default:
   287  		// Multiple tasks. We pick a random task so that we fairly
   288  		// distribute work.
   289  		offset := rand.Int63() % int64(n)
   290  		return b.dequeueForSched(eligibleSched[offset])
   291  	}
   292  }
   293  
   294  // dequeueForSched is used to dequeue the next work item for a given scheduler.
   295  // This assumes locks are held and that this scheduler has work
   296  func (b *EvalBroker) dequeueForSched(sched string) (*structs.Evaluation, string, error) {
   297  	// Get the pending queue
   298  	pending := b.ready[sched]
   299  	raw := heap.Pop(&pending)
   300  	b.ready[sched] = pending
   301  	eval := raw.(*structs.Evaluation)
   302  
   303  	// Generate a UUID for the token
   304  	token := structs.GenerateUUID()
   305  
   306  	// Setup Nack timer
   307  	nackTimer := time.AfterFunc(b.nackTimeout, func() {
   308  		b.Nack(eval.ID, token)
   309  	})
   310  
   311  	// Add to the unack queue
   312  	b.unack[eval.ID] = &unackEval{
   313  		Eval:      eval,
   314  		Token:     token,
   315  		NackTimer: nackTimer,
   316  	}
   317  
   318  	// Increment the dequeue count
   319  	b.evals[eval.ID] += 1
   320  
   321  	// Update the stats
   322  	b.stats.TotalReady -= 1
   323  	b.stats.TotalUnacked += 1
   324  	bySched := b.stats.ByScheduler[sched]
   325  	bySched.Ready -= 1
   326  	bySched.Unacked += 1
   327  
   328  	return eval, token, nil
   329  }
   330  
   331  // waitForSchedulers is used to wait for work on any of the scheduler or until a timeout.
   332  // Returns if there is work waiting potentially.
   333  func (b *EvalBroker) waitForSchedulers(schedulers []string, timeoutCh <-chan time.Time) bool {
   334  	doneCh := make(chan struct{})
   335  	readyCh := make(chan struct{}, 1)
   336  	defer close(doneCh)
   337  
   338  	// Start all the watchers
   339  	b.l.Lock()
   340  	for _, sched := range schedulers {
   341  		waitCh, ok := b.waiting[sched]
   342  		if !ok {
   343  			waitCh = make(chan struct{}, 1)
   344  			b.waiting[sched] = waitCh
   345  		}
   346  
   347  		// Start a goroutine that either waits for the waitCh on this scheduler
   348  		// to unblock or for this waitForSchedulers call to return
   349  		go func() {
   350  			select {
   351  			case <-waitCh:
   352  				select {
   353  				case readyCh <- struct{}{}:
   354  				default:
   355  				}
   356  			case <-doneCh:
   357  			}
   358  		}()
   359  	}
   360  	b.l.Unlock()
   361  
   362  	// Block until we have ready work and should scan, or until we timeout
   363  	// and should not make an attempt to scan for work
   364  	select {
   365  	case <-readyCh:
   366  		return true
   367  	case <-timeoutCh:
   368  		return false
   369  	}
   370  }
   371  
   372  // Outstanding checks if an EvalID has been delivered but not acknowledged
   373  // and returns the associated token for the evaluation.
   374  func (b *EvalBroker) Outstanding(evalID string) (string, bool) {
   375  	b.l.RLock()
   376  	defer b.l.RUnlock()
   377  	unack, ok := b.unack[evalID]
   378  	if !ok {
   379  		return "", false
   380  	}
   381  	return unack.Token, true
   382  }
   383  
   384  // Ack is used to positively acknowledge handling an evaluation
   385  func (b *EvalBroker) Ack(evalID, token string) error {
   386  	b.l.Lock()
   387  	defer b.l.Unlock()
   388  
   389  	// Lookup the unack'd eval
   390  	unack, ok := b.unack[evalID]
   391  	if !ok {
   392  		return fmt.Errorf("Evaluation ID not found")
   393  	}
   394  	if unack.Token != token {
   395  		return fmt.Errorf("Token does not match for Evaluation ID")
   396  	}
   397  	jobID := unack.Eval.JobID
   398  
   399  	// Ensure we were able to stop the timer
   400  	if !unack.NackTimer.Stop() {
   401  		return fmt.Errorf("Evaluation ID Ack'd after Nack timer expiration")
   402  	}
   403  
   404  	// Update the stats
   405  	b.stats.TotalUnacked -= 1
   406  	queue := unack.Eval.Type
   407  	if b.evals[evalID] >= b.deliveryLimit {
   408  		queue = failedQueue
   409  	}
   410  	bySched := b.stats.ByScheduler[queue]
   411  	bySched.Unacked -= 1
   412  
   413  	// Cleanup
   414  	delete(b.unack, evalID)
   415  	delete(b.evals, evalID)
   416  	delete(b.jobEvals, jobID)
   417  
   418  	// Check if there are any blocked evaluations
   419  	if blocked := b.blocked[jobID]; len(blocked) != 0 {
   420  		raw := heap.Pop(&blocked)
   421  		if len(blocked) > 0 {
   422  			b.blocked[jobID] = blocked
   423  		} else {
   424  			delete(b.blocked, jobID)
   425  		}
   426  		eval := raw.(*structs.Evaluation)
   427  		b.stats.TotalBlocked -= 1
   428  		b.enqueueLocked(eval, eval.Type)
   429  		return nil
   430  	}
   431  	return nil
   432  }
   433  
   434  // Nack is used to negatively acknowledge handling an evaluation
   435  func (b *EvalBroker) Nack(evalID, token string) error {
   436  	b.l.Lock()
   437  	defer b.l.Unlock()
   438  
   439  	// Lookup the unack'd eval
   440  	unack, ok := b.unack[evalID]
   441  	if !ok {
   442  		return fmt.Errorf("Evaluation ID not found")
   443  	}
   444  	if unack.Token != token {
   445  		return fmt.Errorf("Token does not match for Evaluation ID")
   446  	}
   447  
   448  	// Stop the timer, doesn't matter if we've missed it
   449  	unack.NackTimer.Stop()
   450  
   451  	// Cleanup
   452  	delete(b.unack, evalID)
   453  
   454  	// Update the stats
   455  	b.stats.TotalUnacked -= 1
   456  	bySched := b.stats.ByScheduler[unack.Eval.Type]
   457  	bySched.Unacked -= 1
   458  
   459  	// Check if we've hit the delivery limit, and re-enqueue
   460  	// in the failedQueue
   461  	if b.evals[evalID] >= b.deliveryLimit {
   462  		b.enqueueLocked(unack.Eval, failedQueue)
   463  	} else {
   464  		b.enqueueLocked(unack.Eval, unack.Eval.Type)
   465  	}
   466  	return nil
   467  }
   468  
   469  // Flush is used to clear the state of the broker
   470  func (b *EvalBroker) Flush() {
   471  	b.l.Lock()
   472  	defer b.l.Unlock()
   473  
   474  	// Unblock any waiters
   475  	for _, waitCh := range b.waiting {
   476  		close(waitCh)
   477  	}
   478  	b.waiting = make(map[string]chan struct{})
   479  
   480  	// Cancel any Nack timers
   481  	for _, unack := range b.unack {
   482  		unack.NackTimer.Stop()
   483  	}
   484  
   485  	// Cancel any time wait evals
   486  	for _, wait := range b.timeWait {
   487  		wait.Stop()
   488  	}
   489  
   490  	// Reset the broker
   491  	b.stats.TotalReady = 0
   492  	b.stats.TotalUnacked = 0
   493  	b.stats.TotalBlocked = 0
   494  	b.stats.TotalWaiting = 0
   495  	b.stats.ByScheduler = make(map[string]*SchedulerStats)
   496  	b.evals = make(map[string]int)
   497  	b.jobEvals = make(map[string]string)
   498  	b.blocked = make(map[string]PendingEvaluations)
   499  	b.ready = make(map[string]PendingEvaluations)
   500  	b.unack = make(map[string]*unackEval)
   501  	b.timeWait = make(map[string]*time.Timer)
   502  }
   503  
   504  // Stats is used to query the state of the broker
   505  func (b *EvalBroker) Stats() *BrokerStats {
   506  	// Allocate a new stats struct
   507  	stats := new(BrokerStats)
   508  	stats.ByScheduler = make(map[string]*SchedulerStats)
   509  
   510  	b.l.RLock()
   511  	defer b.l.RUnlock()
   512  
   513  	// Copy all the stats
   514  	stats.TotalReady = b.stats.TotalReady
   515  	stats.TotalUnacked = b.stats.TotalUnacked
   516  	stats.TotalBlocked = b.stats.TotalBlocked
   517  	stats.TotalWaiting = b.stats.TotalWaiting
   518  	for sched, subStat := range b.stats.ByScheduler {
   519  		subStatCopy := new(SchedulerStats)
   520  		*subStatCopy = *subStat
   521  		stats.ByScheduler[sched] = subStatCopy
   522  	}
   523  	return stats
   524  }
   525  
   526  // EmitStats is used to export metrics about the broker while enabled
   527  func (b *EvalBroker) EmitStats(period time.Duration, stopCh chan struct{}) {
   528  	for {
   529  		select {
   530  		case <-time.After(period):
   531  			stats := b.Stats()
   532  			metrics.SetGauge([]string{"nomad", "broker", "total_ready"}, float32(stats.TotalReady))
   533  			metrics.SetGauge([]string{"nomad", "broker", "total_unacked"}, float32(stats.TotalUnacked))
   534  			metrics.SetGauge([]string{"nomad", "broker", "total_blocked"}, float32(stats.TotalBlocked))
   535  			metrics.SetGauge([]string{"nomad", "broker", "total_waiting"}, float32(stats.TotalWaiting))
   536  			for sched, schedStats := range stats.ByScheduler {
   537  				metrics.SetGauge([]string{"nomad", "broker", sched, "ready"}, float32(schedStats.Ready))
   538  				metrics.SetGauge([]string{"nomad", "broker", sched, "unacked"}, float32(schedStats.Unacked))
   539  			}
   540  
   541  		case <-stopCh:
   542  			return
   543  		}
   544  	}
   545  }
   546  
// BrokerStats holds all the stats about the broker: the broker-wide totals
// plus a breakdown per scheduler queue.
type BrokerStats struct {
	// TotalReady counts evaluations sitting in a ready queue.
	TotalReady int
	// TotalUnacked counts evaluations dequeued but not yet Ack'd or Nack'd.
	TotalUnacked int
	// TotalBlocked counts evaluations held back behind another evaluation
	// of the same job.
	TotalBlocked int
	// TotalWaiting counts evaluations whose wait timer has not fired yet.
	TotalWaiting int
	// ByScheduler holds the ready/unacked counters keyed by queue name.
	ByScheduler map[string]*SchedulerStats
}
   555  
// SchedulerStats holds the stats for a single scheduler queue.
type SchedulerStats struct {
	// Ready counts evaluations waiting in this queue.
	Ready int
	// Unacked counts evaluations dequeued from this queue that have not
	// been Ack'd or Nack'd yet.
	Unacked int
}
   561  
   562  // Len is for the sorting interface
   563  func (p PendingEvaluations) Len() int {
   564  	return len(p)
   565  }
   566  
   567  // Less is for the sorting interface. We flip the check
   568  // so that the "min" in the min-heap is the element with the
   569  // highest priority
   570  func (p PendingEvaluations) Less(i, j int) bool {
   571  	if p[i].JobID != p[j].JobID && p[i].Priority != p[j].Priority {
   572  		return !(p[i].Priority < p[j].Priority)
   573  	}
   574  	return p[i].CreateIndex < p[j].CreateIndex
   575  }
   576  
   577  // Swap is for the sorting interface
   578  func (p PendingEvaluations) Swap(i, j int) {
   579  	p[i], p[j] = p[j], p[i]
   580  }
   581  
   582  // Push is used to add a new evalution to the slice
   583  func (p *PendingEvaluations) Push(e interface{}) {
   584  	*p = append(*p, e.(*structs.Evaluation))
   585  }
   586  
   587  // Pop is used to remove an evaluation from the slice
   588  func (p *PendingEvaluations) Pop() interface{} {
   589  	n := len(*p)
   590  	e := (*p)[n-1]
   591  	(*p)[n-1] = nil
   592  	*p = (*p)[:n-1]
   593  	return e
   594  }
   595  
   596  // Peek is used to peek at the next element that would be popped
   597  func (p PendingEvaluations) Peek() *structs.Evaluation {
   598  	n := len(p)
   599  	if n == 0 {
   600  		return nil
   601  	}
   602  	return p[n-1]
   603  }