// Source: github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/nomad/eval_broker.go

package nomad

import (
	"container/heap"
	"errors"
	"fmt"
	"math/rand"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// failedQueue is the queue we add Evaluations to once
	// they've reached the deliveryLimit. This allows the leader to
	// set the status to failed.
	failedQueue = "_failed"
)

var (
	// ErrNotOutstanding is returned if an evaluation is not outstanding
	ErrNotOutstanding = errors.New("evaluation is not outstanding")

	// ErrTokenMismatch is returned if the outstanding eval has a different token
	ErrTokenMismatch = errors.New("evaluation token does not match")

	// ErrNackTimeoutReached is returned if an expired evaluation is reset
	ErrNackTimeoutReached = errors.New("evaluation nack timeout reached")
)

// EvalBroker is used to manage brokering of evaluations. When an evaluation is
// created, due to a change in a job specification or a node, we put it into the
// broker. The broker sorts evaluations by priority and scheduler type. This
// allows us to dequeue the highest priority work first, while also allowing sub-schedulers
// to only dequeue work they know how to handle. The broker is designed to be entirely
// in-memory and is managed by the leader node.
//
// The broker must provide at-least-once delivery semantics. It relies on explicit
// Ack/Nack messages to handle this. If a delivery is not Ack'd in a sufficient time
// span, it will be assumed Nack'd.
type EvalBroker struct {
	// nackTimeout is how long a dequeued evaluation may stay
	// un-acknowledged before its Nack timer fires.
	nackTimeout time.Duration

	// deliveryLimit is the delivery count at which a Nack'd evaluation
	// is moved to the failedQueue instead of its own scheduler queue.
	deliveryLimit int

	// enabled gates enqueue and dequeue; see SetEnabled.
	enabled bool

	// stats holds the counters reported by Stats and EmitStats.
	stats *BrokerStats

	// evals tracks queued evaluations by ID to de-duplicate enqueue.
	// The counter is the number of times we've attempted delivery,
	// and is used to eventually fail an evaluation.
	evals map[string]int

	// jobEvals tracks queued evaluations by JobID to serialize them
	jobEvals map[string]string

	// blocked tracks the blocked evaluations by JobID in a priority queue
	blocked map[string]PendingEvaluations

	// ready tracks the ready jobs by scheduler in a priority queue
	ready map[string]PendingEvaluations

	// unack is a map of evalID to an un-acknowledged evaluation
	unack map[string]*unackEval

	// waiting is used to notify on a per-scheduler basis of ready work
	waiting map[string]chan struct{}

	// timeWait has evaluations that are waiting for time to elapse
	timeWait map[string]*time.Timer

	// l guards every field above.
	l sync.RWMutex
}

// unackEval tracks an unacknowledged evaluation along with the Nack timer
type unackEval struct {
	// Eval is the evaluation that was handed out.
	Eval *structs.Evaluation

	// Token identifies this particular delivery; Ack/Nack must present it.
	Token string

	// NackTimer fires a Nack for this delivery once the nack timeout elapses.
	NackTimer *time.Timer
}

// PendingEvaluations is a list of waiting evaluations.
// We implement the container/heap interface so that this is a
// priority queue
type PendingEvaluations []*structs.Evaluation

// NewEvalBroker creates a new evaluation broker. This is parameterized
// with the timeout used for messages that are not acknowledged before we
// assume a Nack and attempt to redeliver as well as the deliveryLimit
// which prevents a failing eval from being endlessly delivered.
92 func NewEvalBroker(timeout time.Duration, deliveryLimit int) (*EvalBroker, error) { 93 if timeout < 0 { 94 return nil, fmt.Errorf("timeout cannot be negative") 95 } 96 b := &EvalBroker{ 97 nackTimeout: timeout, 98 deliveryLimit: deliveryLimit, 99 enabled: false, 100 stats: new(BrokerStats), 101 evals: make(map[string]int), 102 jobEvals: make(map[string]string), 103 blocked: make(map[string]PendingEvaluations), 104 ready: make(map[string]PendingEvaluations), 105 unack: make(map[string]*unackEval), 106 waiting: make(map[string]chan struct{}), 107 timeWait: make(map[string]*time.Timer), 108 } 109 b.stats.ByScheduler = make(map[string]*SchedulerStats) 110 return b, nil 111 } 112 113 // Enabled is used to check if the broker is enabled. 114 func (b *EvalBroker) Enabled() bool { 115 b.l.RLock() 116 defer b.l.RUnlock() 117 return b.enabled 118 } 119 120 // SetEnabled is used to control if the broker is enabled. The broker 121 // should only be enabled on the active leader. 122 func (b *EvalBroker) SetEnabled(enabled bool) { 123 b.l.Lock() 124 b.enabled = enabled 125 b.l.Unlock() 126 if !enabled { 127 b.Flush() 128 } 129 } 130 131 // EnqueueAll is used to enqueue many evaluations. 132 func (b *EvalBroker) EnqueueAll(evals []*structs.Evaluation) { 133 // The lock needs to be held until all evaluations are enqueued. This is so 134 // that when Dequeue operations are unblocked they will pick the highest 135 // priority evaluations. 136 b.l.Lock() 137 defer b.l.Unlock() 138 for _, eval := range evals { 139 b.processEnqueue(eval) 140 } 141 } 142 143 // processEnqueue deduplicates evals and either enqueue immediately 144 // or enforce the evals wait time. processEnqueue must be called with the lock 145 // held. 
146 func (b *EvalBroker) processEnqueue(eval *structs.Evaluation) { 147 // Check if already enqueued 148 if _, ok := b.evals[eval.ID]; ok { 149 return 150 } else if b.enabled { 151 b.evals[eval.ID] = 0 152 } 153 154 // Check if we need to enforce a wait 155 if eval.Wait > 0 { 156 timer := time.AfterFunc(eval.Wait, func() { 157 b.enqueueWaiting(eval) 158 }) 159 b.timeWait[eval.ID] = timer 160 b.stats.TotalWaiting += 1 161 return 162 } 163 164 b.enqueueLocked(eval, eval.Type) 165 } 166 167 // Enqueue is used to enqueue an evaluation 168 func (b *EvalBroker) Enqueue(eval *structs.Evaluation) { 169 b.l.Lock() 170 defer b.l.Unlock() 171 b.processEnqueue(eval) 172 } 173 174 // enqueueWaiting is used to enqueue a waiting evaluation 175 func (b *EvalBroker) enqueueWaiting(eval *structs.Evaluation) { 176 b.l.Lock() 177 defer b.l.Unlock() 178 delete(b.timeWait, eval.ID) 179 b.stats.TotalWaiting -= 1 180 b.enqueueLocked(eval, eval.Type) 181 } 182 183 // enqueueLocked is used to enqueue with the lock held 184 func (b *EvalBroker) enqueueLocked(eval *structs.Evaluation, queue string) { 185 // Do nothing if not enabled 186 if !b.enabled { 187 return 188 } 189 190 // Check if there is an evaluation for this JobID pending 191 pendingEval := b.jobEvals[eval.JobID] 192 if pendingEval == "" { 193 b.jobEvals[eval.JobID] = eval.ID 194 } else if pendingEval != eval.ID { 195 blocked := b.blocked[eval.JobID] 196 heap.Push(&blocked, eval) 197 b.blocked[eval.JobID] = blocked 198 b.stats.TotalBlocked += 1 199 return 200 } 201 202 // Find the pending by scheduler class 203 pending, ok := b.ready[queue] 204 if !ok { 205 pending = make([]*structs.Evaluation, 0, 16) 206 if _, ok := b.waiting[queue]; !ok { 207 b.waiting[queue] = make(chan struct{}, 1) 208 } 209 } 210 211 // Push onto the heap 212 heap.Push(&pending, eval) 213 b.ready[queue] = pending 214 215 // Update the stats 216 b.stats.TotalReady += 1 217 bySched, ok := b.stats.ByScheduler[queue] 218 if !ok { 219 bySched = &SchedulerStats{} 
220 b.stats.ByScheduler[queue] = bySched 221 } 222 bySched.Ready += 1 223 224 // Unblock any blocked dequeues 225 select { 226 case b.waiting[queue] <- struct{}{}: 227 default: 228 } 229 } 230 231 // Dequeue is used to perform a blocking dequeue 232 func (b *EvalBroker) Dequeue(schedulers []string, timeout time.Duration) (*structs.Evaluation, string, error) { 233 var timeoutTimer *time.Timer 234 var timeoutCh <-chan time.Time 235 SCAN: 236 // Scan for work 237 eval, token, err := b.scanForSchedulers(schedulers) 238 if err != nil { 239 if timeoutTimer != nil { 240 timeoutTimer.Stop() 241 } 242 return nil, "", err 243 } 244 245 // Check if we have something 246 if eval != nil { 247 if timeoutTimer != nil { 248 timeoutTimer.Stop() 249 } 250 return eval, token, nil 251 } 252 253 // Setup the timeout channel the first time around 254 if timeoutTimer == nil && timeout != 0 { 255 timeoutTimer = time.NewTimer(timeout) 256 timeoutCh = timeoutTimer.C 257 } 258 259 // Block until we get work 260 scan := b.waitForSchedulers(schedulers, timeoutCh) 261 if scan { 262 goto SCAN 263 } 264 return nil, "", nil 265 } 266 267 // scanForSchedulers scans for work on any of the schedulers. The highest priority work 268 // is dequeued first. This may return nothing if there is no work waiting. 
269 func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation, string, error) { 270 b.l.Lock() 271 defer b.l.Unlock() 272 273 // Do nothing if not enabled 274 if !b.enabled { 275 return nil, "", fmt.Errorf("eval broker disabled") 276 } 277 278 // Scan for eligible work 279 var eligibleSched []string 280 var eligiblePriority int 281 for _, sched := range schedulers { 282 // Get the pending queue 283 pending, ok := b.ready[sched] 284 if !ok { 285 continue 286 } 287 288 // Peek at the next item 289 ready := pending.Peek() 290 if ready == nil { 291 continue 292 } 293 294 // Add to eligible if equal or greater priority 295 if len(eligibleSched) == 0 || ready.Priority > eligiblePriority { 296 eligibleSched = []string{sched} 297 eligiblePriority = ready.Priority 298 299 } else if eligiblePriority > ready.Priority { 300 continue 301 302 } else if eligiblePriority == ready.Priority { 303 eligibleSched = append(eligibleSched, sched) 304 } 305 } 306 307 // Determine behavior based on eligible work 308 switch n := len(eligibleSched); n { 309 case 0: 310 // No work to do! 311 return nil, "", nil 312 313 case 1: 314 // Only a single task, dequeue 315 return b.dequeueForSched(eligibleSched[0]) 316 317 default: 318 // Multiple tasks. We pick a random task so that we fairly 319 // distribute work. 320 offset := rand.Int63() % int64(n) 321 return b.dequeueForSched(eligibleSched[offset]) 322 } 323 } 324 325 // dequeueForSched is used to dequeue the next work item for a given scheduler. 
326 // This assumes locks are held and that this scheduler has work 327 func (b *EvalBroker) dequeueForSched(sched string) (*structs.Evaluation, string, error) { 328 // Get the pending queue 329 pending := b.ready[sched] 330 raw := heap.Pop(&pending) 331 b.ready[sched] = pending 332 eval := raw.(*structs.Evaluation) 333 334 // Generate a UUID for the token 335 token := structs.GenerateUUID() 336 337 // Setup Nack timer 338 nackTimer := time.AfterFunc(b.nackTimeout, func() { 339 b.Nack(eval.ID, token) 340 }) 341 342 // Add to the unack queue 343 b.unack[eval.ID] = &unackEval{ 344 Eval: eval, 345 Token: token, 346 NackTimer: nackTimer, 347 } 348 349 // Increment the dequeue count 350 b.evals[eval.ID] += 1 351 352 // Update the stats 353 b.stats.TotalReady -= 1 354 b.stats.TotalUnacked += 1 355 bySched := b.stats.ByScheduler[sched] 356 bySched.Ready -= 1 357 bySched.Unacked += 1 358 359 return eval, token, nil 360 } 361 362 // waitForSchedulers is used to wait for work on any of the scheduler or until a timeout. 363 // Returns if there is work waiting potentially. 
func (b *EvalBroker) waitForSchedulers(schedulers []string, timeoutCh <-chan time.Time) bool {
	// doneCh is closed when this call returns, which releases every watcher
	// goroutine started below. readyCh has room for one signal so that a
	// watcher never blocks when reporting work.
	doneCh := make(chan struct{})
	readyCh := make(chan struct{}, 1)
	defer close(doneCh)

	// Start all the watchers
	b.l.Lock()
	for _, sched := range schedulers {
		// Create the notification channel if nothing has been enqueued
		// for this scheduler yet.
		waitCh, ok := b.waiting[sched]
		if !ok {
			waitCh = make(chan struct{}, 1)
			b.waiting[sched] = waitCh
		}

		// Start a goroutine that either waits for the waitCh on this scheduler
		// to unblock or for this waitForSchedulers call to return
		go func() {
			select {
			case <-waitCh:
				// Non-blocking send: one pending signal is enough.
				select {
				case readyCh <- struct{}{}:
				default:
				}
			case <-doneCh:
			}
		}()
	}
	b.l.Unlock()

	// Block until we have ready work and should scan, or until we timeout
	// and should not make an attempt to scan for work. A nil timeoutCh never
	// fires, so in that case only ready work ends the wait.
	select {
	case <-readyCh:
		return true
	case <-timeoutCh:
		return false
	}
}

// Outstanding checks if an EvalID has been delivered but not acknowledged
// and returns the associated token for the evaluation.
405 func (b *EvalBroker) Outstanding(evalID string) (string, bool) { 406 b.l.RLock() 407 defer b.l.RUnlock() 408 unack, ok := b.unack[evalID] 409 if !ok { 410 return "", false 411 } 412 return unack.Token, true 413 } 414 415 // OutstandingReset resets the Nack timer for the EvalID if the 416 // token matches and the eval is outstanding 417 func (b *EvalBroker) OutstandingReset(evalID, token string) error { 418 b.l.RLock() 419 defer b.l.RUnlock() 420 unack, ok := b.unack[evalID] 421 if !ok { 422 return ErrNotOutstanding 423 } 424 if unack.Token != token { 425 return ErrTokenMismatch 426 } 427 if !unack.NackTimer.Reset(b.nackTimeout) { 428 return ErrNackTimeoutReached 429 } 430 return nil 431 } 432 433 // Ack is used to positively acknowledge handling an evaluation 434 func (b *EvalBroker) Ack(evalID, token string) error { 435 b.l.Lock() 436 defer b.l.Unlock() 437 438 // Lookup the unack'd eval 439 unack, ok := b.unack[evalID] 440 if !ok { 441 return fmt.Errorf("Evaluation ID not found") 442 } 443 if unack.Token != token { 444 return fmt.Errorf("Token does not match for Evaluation ID") 445 } 446 jobID := unack.Eval.JobID 447 448 // Ensure we were able to stop the timer 449 if !unack.NackTimer.Stop() { 450 return fmt.Errorf("Evaluation ID Ack'd after Nack timer expiration") 451 } 452 453 // Update the stats 454 b.stats.TotalUnacked -= 1 455 queue := unack.Eval.Type 456 if b.evals[evalID] > b.deliveryLimit { 457 queue = failedQueue 458 } 459 bySched := b.stats.ByScheduler[queue] 460 bySched.Unacked -= 1 461 462 // Cleanup 463 delete(b.unack, evalID) 464 delete(b.evals, evalID) 465 delete(b.jobEvals, jobID) 466 467 // Check if there are any blocked evaluations 468 if blocked := b.blocked[jobID]; len(blocked) != 0 { 469 raw := heap.Pop(&blocked) 470 if len(blocked) > 0 { 471 b.blocked[jobID] = blocked 472 } else { 473 delete(b.blocked, jobID) 474 } 475 eval := raw.(*structs.Evaluation) 476 b.stats.TotalBlocked -= 1 477 b.enqueueLocked(eval, eval.Type) 478 return nil 479 
} 480 return nil 481 } 482 483 // Nack is used to negatively acknowledge handling an evaluation 484 func (b *EvalBroker) Nack(evalID, token string) error { 485 b.l.Lock() 486 defer b.l.Unlock() 487 488 // Lookup the unack'd eval 489 unack, ok := b.unack[evalID] 490 if !ok { 491 return fmt.Errorf("Evaluation ID not found") 492 } 493 if unack.Token != token { 494 return fmt.Errorf("Token does not match for Evaluation ID") 495 } 496 497 // Stop the timer, doesn't matter if we've missed it 498 unack.NackTimer.Stop() 499 500 // Cleanup 501 delete(b.unack, evalID) 502 503 // Update the stats 504 b.stats.TotalUnacked -= 1 505 bySched := b.stats.ByScheduler[unack.Eval.Type] 506 bySched.Unacked -= 1 507 508 // Check if we've hit the delivery limit, and re-enqueue 509 // in the failedQueue 510 if b.evals[evalID] >= b.deliveryLimit { 511 b.enqueueLocked(unack.Eval, failedQueue) 512 } else { 513 b.enqueueLocked(unack.Eval, unack.Eval.Type) 514 } 515 return nil 516 } 517 518 // PauseNackTimeout is used to pause the Nack timeout for an eval that is making 519 // progress but is in a potentially unbounded operation such as the plan queue. 520 func (b *EvalBroker) PauseNackTimeout(evalID, token string) error { 521 b.l.RLock() 522 defer b.l.RUnlock() 523 unack, ok := b.unack[evalID] 524 if !ok { 525 return ErrNotOutstanding 526 } 527 if unack.Token != token { 528 return ErrTokenMismatch 529 } 530 if !unack.NackTimer.Stop() { 531 return ErrNackTimeoutReached 532 } 533 return nil 534 } 535 536 // ResumeNackTimeout is used to resume the Nack timeout for an eval that was 537 // paused. It should be resumed after leaving an unbounded operation. 
538 func (b *EvalBroker) ResumeNackTimeout(evalID, token string) error { 539 b.l.Lock() 540 defer b.l.Unlock() 541 unack, ok := b.unack[evalID] 542 if !ok { 543 return ErrNotOutstanding 544 } 545 if unack.Token != token { 546 return ErrTokenMismatch 547 } 548 unack.NackTimer.Reset(b.nackTimeout) 549 return nil 550 } 551 552 // Flush is used to clear the state of the broker 553 func (b *EvalBroker) Flush() { 554 b.l.Lock() 555 defer b.l.Unlock() 556 557 // Unblock any waiters 558 for _, waitCh := range b.waiting { 559 close(waitCh) 560 } 561 b.waiting = make(map[string]chan struct{}) 562 563 // Cancel any Nack timers 564 for _, unack := range b.unack { 565 unack.NackTimer.Stop() 566 } 567 568 // Cancel any time wait evals 569 for _, wait := range b.timeWait { 570 wait.Stop() 571 } 572 573 // Reset the broker 574 b.stats.TotalReady = 0 575 b.stats.TotalUnacked = 0 576 b.stats.TotalBlocked = 0 577 b.stats.TotalWaiting = 0 578 b.stats.ByScheduler = make(map[string]*SchedulerStats) 579 b.evals = make(map[string]int) 580 b.jobEvals = make(map[string]string) 581 b.blocked = make(map[string]PendingEvaluations) 582 b.ready = make(map[string]PendingEvaluations) 583 b.unack = make(map[string]*unackEval) 584 b.timeWait = make(map[string]*time.Timer) 585 } 586 587 // Stats is used to query the state of the broker 588 func (b *EvalBroker) Stats() *BrokerStats { 589 // Allocate a new stats struct 590 stats := new(BrokerStats) 591 stats.ByScheduler = make(map[string]*SchedulerStats) 592 593 b.l.RLock() 594 defer b.l.RUnlock() 595 596 // Copy all the stats 597 stats.TotalReady = b.stats.TotalReady 598 stats.TotalUnacked = b.stats.TotalUnacked 599 stats.TotalBlocked = b.stats.TotalBlocked 600 stats.TotalWaiting = b.stats.TotalWaiting 601 for sched, subStat := range b.stats.ByScheduler { 602 subStatCopy := new(SchedulerStats) 603 *subStatCopy = *subStat 604 stats.ByScheduler[sched] = subStatCopy 605 } 606 return stats 607 } 608 609 // EmitStats is used to export metrics about the 
broker while enabled 610 func (b *EvalBroker) EmitStats(period time.Duration, stopCh chan struct{}) { 611 for { 612 select { 613 case <-time.After(period): 614 stats := b.Stats() 615 metrics.SetGauge([]string{"nomad", "broker", "total_ready"}, float32(stats.TotalReady)) 616 metrics.SetGauge([]string{"nomad", "broker", "total_unacked"}, float32(stats.TotalUnacked)) 617 metrics.SetGauge([]string{"nomad", "broker", "total_blocked"}, float32(stats.TotalBlocked)) 618 metrics.SetGauge([]string{"nomad", "broker", "total_waiting"}, float32(stats.TotalWaiting)) 619 for sched, schedStats := range stats.ByScheduler { 620 metrics.SetGauge([]string{"nomad", "broker", sched, "ready"}, float32(schedStats.Ready)) 621 metrics.SetGauge([]string{"nomad", "broker", sched, "unacked"}, float32(schedStats.Unacked)) 622 } 623 624 case <-stopCh: 625 return 626 } 627 } 628 } 629 630 // BrokerStats returns all the stats about the broker 631 type BrokerStats struct { 632 TotalReady int 633 TotalUnacked int 634 TotalBlocked int 635 TotalWaiting int 636 ByScheduler map[string]*SchedulerStats 637 } 638 639 // SchedulerStats returns the stats per scheduler 640 type SchedulerStats struct { 641 Ready int 642 Unacked int 643 } 644 645 // Len is for the sorting interface 646 func (p PendingEvaluations) Len() int { 647 return len(p) 648 } 649 650 // Less is for the sorting interface. 
We flip the check 651 // so that the "min" in the min-heap is the element with the 652 // highest priority 653 func (p PendingEvaluations) Less(i, j int) bool { 654 if p[i].JobID != p[j].JobID && p[i].Priority != p[j].Priority { 655 return !(p[i].Priority < p[j].Priority) 656 } 657 return p[i].CreateIndex < p[j].CreateIndex 658 } 659 660 // Swap is for the sorting interface 661 func (p PendingEvaluations) Swap(i, j int) { 662 p[i], p[j] = p[j], p[i] 663 } 664 665 // Push is used to add a new evalution to the slice 666 func (p *PendingEvaluations) Push(e interface{}) { 667 *p = append(*p, e.(*structs.Evaluation)) 668 } 669 670 // Pop is used to remove an evaluation from the slice 671 func (p *PendingEvaluations) Pop() interface{} { 672 n := len(*p) 673 e := (*p)[n-1] 674 (*p)[n-1] = nil 675 *p = (*p)[:n-1] 676 return e 677 } 678 679 // Peek is used to peek at the next element that would be popped 680 func (p PendingEvaluations) Peek() *structs.Evaluation { 681 n := len(p) 682 if n == 0 { 683 return nil 684 } 685 return p[n-1] 686 }