github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/nomad/eval_broker.go

package nomad

import (
	"container/heap"
	"errors"
	"fmt"
	"math/rand"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// failedQueue is the queue we add Evaluations to once
	// they've reached the deliveryLimit. This allows the leader to
	// set the status to failed.
	failedQueue = "_failed"
)

var (
	// ErrNotOutstanding is returned if an evaluation is not outstanding
	ErrNotOutstanding = errors.New("evaluation is not outstanding")

	// ErrTokenMismatch is returned if the outstanding eval has a different token
	ErrTokenMismatch = errors.New("evaluation token does not match")

	// ErrNackTimeoutReached is returned if an expired evaluation is reset
	ErrNackTimeoutReached = errors.New("evaluation nack timeout reached")
)

// EvalBroker is used to manage brokering of evaluations. When an evaluation is
// created, due to a change in a job specification or a node, we put it into the
// broker. The broker sorts evaluations by priority and scheduler type. This
// allows us to dequeue the highest priority work first, while also allowing
// sub-schedulers to only dequeue work they know how to handle. The broker is
// designed to be entirely in-memory and is managed by the leader node.
//
// The broker must provide at-least-once delivery semantics. It relies on explicit
// Ack/Nack messages to handle this. If a delivery is not Ack'd in a sufficient time
// span, it will be assumed Nack'd.
type EvalBroker struct {
	nackTimeout   time.Duration
	deliveryLimit int

	enabled bool
	stats   *BrokerStats

	// evals tracks queued evaluations by ID to de-duplicate enqueue.
	// The counter is the number of times we've attempted delivery,
	// and is used to eventually fail an evaluation.
	evals map[string]int

	// jobEvals tracks queued evaluations by JobID to serialize them
	jobEvals map[string]string

	// blocked tracks the blocked evaluations by JobID in a priority queue
	blocked map[string]PendingEvaluations

	// ready tracks the ready jobs by scheduler in a priority queue
	ready map[string]PendingEvaluations

	// unack is a map of evalID to an un-acknowledged evaluation
	unack map[string]*unackEval

	// waiting is used to notify on a per-scheduler basis of ready work
	waiting map[string]chan struct{}

	// requeue tracks evaluations that need to be re-enqueued once the current
	// evaluation finishes by token. If the token is Nacked or rejected the
	// evaluation is dropped but if Acked successfully, the evaluation is
	// queued.
	requeue map[string]*structs.Evaluation

	// timeWait has evaluations that are waiting for time to elapse
	timeWait map[string]*time.Timer

	l sync.RWMutex
}

// unackEval tracks an unacknowledged evaluation along with the Nack timer
type unackEval struct {
	Eval      *structs.Evaluation
	Token     string
	NackTimer *time.Timer
}

// PendingEvaluations is a list of waiting evaluations.
// We implement the container/heap interface so that this is a
// priority queue
type PendingEvaluations []*structs.Evaluation

// NewEvalBroker creates a new evaluation broker. This is parameterized
// with the timeout used for messages that are not acknowledged before we
// assume a Nack and attempt to redeliver, as well as the deliveryLimit,
// which prevents a failing eval from being endlessly delivered.
func NewEvalBroker(timeout time.Duration, deliveryLimit int) (*EvalBroker, error) {
	if timeout < 0 {
		return nil, fmt.Errorf("timeout cannot be negative")
	}
	b := &EvalBroker{
		nackTimeout:   timeout,
		deliveryLimit: deliveryLimit,
		enabled:       false,
		stats:         new(BrokerStats),
		evals:         make(map[string]int),
		jobEvals:      make(map[string]string),
		blocked:       make(map[string]PendingEvaluations),
		ready:         make(map[string]PendingEvaluations),
		unack:         make(map[string]*unackEval),
		waiting:       make(map[string]chan struct{}),
		requeue:       make(map[string]*structs.Evaluation),
		timeWait:      make(map[string]*time.Timer),
	}
	b.stats.ByScheduler = make(map[string]*SchedulerStats)
	return b, nil
}

// Enabled is used to check if the broker is enabled.
func (b *EvalBroker) Enabled() bool {
	b.l.RLock()
	defer b.l.RUnlock()
	return b.enabled
}

// SetEnabled is used to control if the broker is enabled. The broker
// should only be enabled on the active leader.
func (b *EvalBroker) SetEnabled(enabled bool) {
	b.l.Lock()
	b.enabled = enabled
	b.l.Unlock()
	if !enabled {
		b.Flush()
	}
}

// Enqueue is used to enqueue a new evaluation
func (b *EvalBroker) Enqueue(eval *structs.Evaluation) {
	b.l.Lock()
	defer b.l.Unlock()
	b.processEnqueue(eval, "")
}

// EnqueueAll is used to enqueue many evaluations. The map allows evaluations
// that are being re-enqueued to include their token.
//
// When requeuing an evaluation that may already be enqueued, the evaluation
// is handled in one of the following ways:
// * Evaluation not outstanding: Process as a normal Enqueue
// * Evaluation outstanding: Do not allow the evaluation to be dequeued until:
//   * Ack received: Unblock the evaluation allowing it to be dequeued
//   * Nack received: Drop the evaluation as it was created as a result of a
//     scheduler run that was Nack'd
func (b *EvalBroker) EnqueueAll(evals map[*structs.Evaluation]string) {
	// The lock needs to be held until all evaluations are enqueued. This is so
	// that when Dequeue operations are unblocked they will pick the highest
	// priority evaluations.
	b.l.Lock()
	defer b.l.Unlock()
	for eval, token := range evals {
		b.processEnqueue(eval, token)
	}
}

// processEnqueue deduplicates evals and either enqueues them immediately or
// enforces the eval's wait time. If the token is passed, and the evaluation ID
// is outstanding, the evaluation is blocked until an Ack/Nack is received.
// processEnqueue must be called with the lock held.
func (b *EvalBroker) processEnqueue(eval *structs.Evaluation, token string) {
	// Check if already enqueued
	if _, ok := b.evals[eval.ID]; ok {
		if token == "" {
			return
		}

		// If the token has been passed, the evaluation is being reblocked by
		// the scheduler and should be processed once the outstanding evaluation
		// is Acked or Nacked.
		if unack, ok := b.unack[eval.ID]; ok && unack.Token == token {
			b.requeue[token] = eval
		}
		return
	} else if b.enabled {
		b.evals[eval.ID] = 0
	}

	// Check if we need to enforce a wait
	if eval.Wait > 0 {
		timer := time.AfterFunc(eval.Wait, func() {
			b.enqueueWaiting(eval)
		})
		b.timeWait[eval.ID] = timer
		b.stats.TotalWaiting += 1
		return
	}

	b.enqueueLocked(eval, eval.Type)
}

// enqueueWaiting is used to enqueue a waiting evaluation
func (b *EvalBroker) enqueueWaiting(eval *structs.Evaluation) {
	b.l.Lock()
	defer b.l.Unlock()
	delete(b.timeWait, eval.ID)
	b.stats.TotalWaiting -= 1
	b.enqueueLocked(eval, eval.Type)
}

// enqueueLocked is used to enqueue with the lock held
func (b *EvalBroker) enqueueLocked(eval *structs.Evaluation, queue string) {
	// Do nothing if not enabled
	if !b.enabled {
		return
	}

	// Check if there is an evaluation for this JobID pending
	pendingEval := b.jobEvals[eval.JobID]
	if pendingEval == "" {
		b.jobEvals[eval.JobID] = eval.ID
	} else if pendingEval != eval.ID {
		blocked := b.blocked[eval.JobID]
		heap.Push(&blocked, eval)
		b.blocked[eval.JobID] = blocked
		b.stats.TotalBlocked += 1
		return
	}

	// Find the pending queue by scheduler class
	pending, ok := b.ready[queue]
	if !ok {
		pending = make([]*structs.Evaluation, 0, 16)
		if _, ok := b.waiting[queue]; !ok {
			b.waiting[queue] = make(chan struct{}, 1)
		}
	}

	// Push onto the heap
	heap.Push(&pending, eval)
	b.ready[queue] = pending

	// Update the stats
	b.stats.TotalReady += 1
	bySched, ok := b.stats.ByScheduler[queue]
	if !ok {
		bySched = &SchedulerStats{}
		b.stats.ByScheduler[queue] = bySched
	}
	bySched.Ready += 1

	// Unblock any blocked dequeues
	select {
	case b.waiting[queue] <- struct{}{}:
	default:
	}
}

// Dequeue is used to perform a blocking dequeue
func (b *EvalBroker) Dequeue(schedulers []string, timeout time.Duration) (*structs.Evaluation, string, error) {
	var timeoutTimer *time.Timer
	var timeoutCh <-chan time.Time
SCAN:
	// Scan for work
	eval, token, err := b.scanForSchedulers(schedulers)
	if err != nil {
		if timeoutTimer != nil {
			timeoutTimer.Stop()
		}
		return nil, "", err
	}

	// Check if we have something
	if eval != nil {
		if timeoutTimer != nil {
			timeoutTimer.Stop()
		}
		return eval, token, nil
	}

	// Setup the timeout channel the first time around
	if timeoutTimer == nil && timeout != 0 {
		timeoutTimer = time.NewTimer(timeout)
		timeoutCh = timeoutTimer.C
	}

	// Block until we get work
	scan := b.waitForSchedulers(schedulers, timeoutCh)
	if scan {
		goto SCAN
	}
	return nil, "", nil
}

// scanForSchedulers scans for work on any of the schedulers. The highest priority work
// is dequeued first. This may return nothing if there is no work waiting.
func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation, string, error) {
	b.l.Lock()
	defer b.l.Unlock()

	// Do nothing if not enabled
	if !b.enabled {
		return nil, "", fmt.Errorf("eval broker disabled")
	}

	// Scan for eligible work
	var eligibleSched []string
	var eligiblePriority int
	for _, sched := range schedulers {
		// Get the pending queue
		pending, ok := b.ready[sched]
		if !ok {
			continue
		}

		// Peek at the next item
		ready := pending.Peek()
		if ready == nil {
			continue
		}

		// Add to eligible if equal or greater priority
		if len(eligibleSched) == 0 || ready.Priority > eligiblePriority {
			eligibleSched = []string{sched}
			eligiblePriority = ready.Priority

		} else if eligiblePriority > ready.Priority {
			continue

		} else if eligiblePriority == ready.Priority {
			eligibleSched = append(eligibleSched, sched)
		}
	}

	// Determine behavior based on eligible work
	switch n := len(eligibleSched); n {
	case 0:
		// No work to do!
		return nil, "", nil

	case 1:
		// Only a single task, dequeue
		return b.dequeueForSched(eligibleSched[0])

	default:
		// Multiple tasks. We pick a random task so that we fairly
		// distribute work.
		offset := rand.Intn(n)
		return b.dequeueForSched(eligibleSched[offset])
	}
}

// dequeueForSched is used to dequeue the next work item for a given scheduler.
// This assumes locks are held and that this scheduler has work
func (b *EvalBroker) dequeueForSched(sched string) (*structs.Evaluation, string, error) {
	// Get the pending queue
	pending := b.ready[sched]
	raw := heap.Pop(&pending)
	b.ready[sched] = pending
	eval := raw.(*structs.Evaluation)

	// Generate a UUID for the token
	token := structs.GenerateUUID()

	// Setup Nack timer
	nackTimer := time.AfterFunc(b.nackTimeout, func() {
		b.Nack(eval.ID, token)
	})

	// Add to the unack queue
	b.unack[eval.ID] = &unackEval{
		Eval:      eval,
		Token:     token,
		NackTimer: nackTimer,
	}

	// Increment the dequeue count
	b.evals[eval.ID] += 1

	// Update the stats
	b.stats.TotalReady -= 1
	b.stats.TotalUnacked += 1
	bySched := b.stats.ByScheduler[sched]
	bySched.Ready -= 1
	bySched.Unacked += 1

	return eval, token, nil
}

// waitForSchedulers is used to wait for work on any of the schedulers or until a timeout.
// It returns true if there is potentially work waiting to be scanned for.
func (b *EvalBroker) waitForSchedulers(schedulers []string, timeoutCh <-chan time.Time) bool {
	doneCh := make(chan struct{})
	readyCh := make(chan struct{}, 1)
	defer close(doneCh)

	// Start all the watchers
	b.l.Lock()
	for _, sched := range schedulers {
		waitCh, ok := b.waiting[sched]
		if !ok {
			waitCh = make(chan struct{}, 1)
			b.waiting[sched] = waitCh
		}

		// Start a goroutine that either waits for the waitCh on this scheduler
		// to unblock or for this waitForSchedulers call to return
		go func() {
			select {
			case <-waitCh:
				select {
				case readyCh <- struct{}{}:
				default:
				}
			case <-doneCh:
			}
		}()
	}
	b.l.Unlock()

	// Block until we have ready work and should scan, or until we timeout
	// and should not make an attempt to scan for work
	select {
	case <-readyCh:
		return true
	case <-timeoutCh:
		return false
	}
}

// Outstanding checks if an EvalID has been delivered but not acknowledged
// and returns the associated token for the evaluation.
func (b *EvalBroker) Outstanding(evalID string) (string, bool) {
	b.l.RLock()
	defer b.l.RUnlock()
	unack, ok := b.unack[evalID]
	if !ok {
		return "", false
	}
	return unack.Token, true
}

// OutstandingReset resets the Nack timer for the EvalID if the
// token matches and the eval is outstanding
func (b *EvalBroker) OutstandingReset(evalID, token string) error {
	b.l.RLock()
	defer b.l.RUnlock()
	unack, ok := b.unack[evalID]
	if !ok {
		return ErrNotOutstanding
	}
	if unack.Token != token {
		return ErrTokenMismatch
	}
	if !unack.NackTimer.Reset(b.nackTimeout) {
		return ErrNackTimeoutReached
	}
	return nil
}

// Ack is used to positively acknowledge handling an evaluation
func (b *EvalBroker) Ack(evalID, token string) error {
	b.l.Lock()
	defer b.l.Unlock()

	// Always delete the requeued evaluation. Either the Ack is successful and
	// we requeue it or it isn't and we want to remove it.
	defer delete(b.requeue, token)

	// Lookup the unack'd eval
	unack, ok := b.unack[evalID]
	if !ok {
		return fmt.Errorf("Evaluation ID not found")
	}
	if unack.Token != token {
		return fmt.Errorf("Token does not match for Evaluation ID")
	}
	jobID := unack.Eval.JobID

	// Ensure we were able to stop the timer
	if !unack.NackTimer.Stop() {
		return fmt.Errorf("Evaluation ID Ack'd after Nack timer expiration")
	}

	// Update the stats
	b.stats.TotalUnacked -= 1
	queue := unack.Eval.Type
	if b.evals[evalID] > b.deliveryLimit {
		queue = failedQueue
	}
	bySched := b.stats.ByScheduler[queue]
	bySched.Unacked -= 1

	// Cleanup
	delete(b.unack, evalID)
	delete(b.evals, evalID)
	delete(b.jobEvals, jobID)

	// Check if there are any blocked evaluations
	if blocked := b.blocked[jobID]; len(blocked) != 0 {
		raw := heap.Pop(&blocked)
		if len(blocked) > 0 {
			b.blocked[jobID] = blocked
		} else {
			delete(b.blocked, jobID)
		}
		eval := raw.(*structs.Evaluation)
		b.stats.TotalBlocked -= 1
		b.enqueueLocked(eval, eval.Type)
	}

	// Re-enqueue the evaluation.
	if eval, ok := b.requeue[token]; ok {
		b.processEnqueue(eval, "")
	}

	return nil
}

// Nack is used to negatively acknowledge handling an evaluation
func (b *EvalBroker) Nack(evalID, token string) error {
	b.l.Lock()
	defer b.l.Unlock()

	// Always delete the requeued evaluation since the Nack means the requeue is
	// invalid.
	delete(b.requeue, token)

	// Lookup the unack'd eval
	unack, ok := b.unack[evalID]
	if !ok {
		return fmt.Errorf("Evaluation ID not found")
	}
	if unack.Token != token {
		return fmt.Errorf("Token does not match for Evaluation ID")
	}

	// Stop the timer, doesn't matter if we've missed it
	unack.NackTimer.Stop()

	// Cleanup
	delete(b.unack, evalID)

	// Update the stats
	b.stats.TotalUnacked -= 1
	bySched := b.stats.ByScheduler[unack.Eval.Type]
	bySched.Unacked -= 1

	// Check if we've hit the delivery limit, and re-enqueue
	// in the failedQueue
	if b.evals[evalID] >= b.deliveryLimit {
		b.enqueueLocked(unack.Eval, failedQueue)
	} else {
		b.enqueueLocked(unack.Eval, unack.Eval.Type)
	}
	return nil
}

// PauseNackTimeout is used to pause the Nack timeout for an eval that is making
// progress but is in a potentially unbounded operation such as the plan queue.
func (b *EvalBroker) PauseNackTimeout(evalID, token string) error {
	b.l.RLock()
	defer b.l.RUnlock()
	unack, ok := b.unack[evalID]
	if !ok {
		return ErrNotOutstanding
	}
	if unack.Token != token {
		return ErrTokenMismatch
	}
	if !unack.NackTimer.Stop() {
		return ErrNackTimeoutReached
	}
	return nil
}

// ResumeNackTimeout is used to resume the Nack timeout for an eval that was
// paused. It should be resumed after leaving an unbounded operation.
func (b *EvalBroker) ResumeNackTimeout(evalID, token string) error {
	b.l.Lock()
	defer b.l.Unlock()
	unack, ok := b.unack[evalID]
	if !ok {
		return ErrNotOutstanding
	}
	if unack.Token != token {
		return ErrTokenMismatch
	}
	unack.NackTimer.Reset(b.nackTimeout)
	return nil
}

// Flush is used to clear the state of the broker
func (b *EvalBroker) Flush() {
	b.l.Lock()
	defer b.l.Unlock()

	// Unblock any waiters
	for _, waitCh := range b.waiting {
		close(waitCh)
	}
	b.waiting = make(map[string]chan struct{})

	// Cancel any Nack timers
	for _, unack := range b.unack {
		unack.NackTimer.Stop()
	}

	// Cancel any time wait evals
	for _, wait := range b.timeWait {
		wait.Stop()
	}

	// Reset the broker
	b.stats.TotalReady = 0
	b.stats.TotalUnacked = 0
	b.stats.TotalBlocked = 0
	b.stats.TotalWaiting = 0
	b.stats.ByScheduler = make(map[string]*SchedulerStats)
	b.evals = make(map[string]int)
	b.jobEvals = make(map[string]string)
	b.blocked = make(map[string]PendingEvaluations)
	b.ready = make(map[string]PendingEvaluations)
	b.unack = make(map[string]*unackEval)
	b.timeWait = make(map[string]*time.Timer)
}

// Stats is used to query the state of the broker
func (b *EvalBroker) Stats() *BrokerStats {
	// Allocate a new stats struct
	stats := new(BrokerStats)
	stats.ByScheduler = make(map[string]*SchedulerStats)

	b.l.RLock()
	defer b.l.RUnlock()

	// Copy all the stats
	stats.TotalReady = b.stats.TotalReady
	stats.TotalUnacked = b.stats.TotalUnacked
	stats.TotalBlocked = b.stats.TotalBlocked
	stats.TotalWaiting = b.stats.TotalWaiting
	for sched, subStat := range b.stats.ByScheduler {
		subStatCopy := new(SchedulerStats)
		*subStatCopy = *subStat
		stats.ByScheduler[sched] = subStatCopy
	}
	return stats
}

// EmitStats is used to export metrics about the broker while enabled
func (b *EvalBroker) EmitStats(period time.Duration, stopCh chan struct{}) {
	for {
		select {
		case <-time.After(period):
			stats := b.Stats()
			metrics.SetGauge([]string{"nomad", "broker", "total_ready"}, float32(stats.TotalReady))
			metrics.SetGauge([]string{"nomad", "broker", "total_unacked"}, float32(stats.TotalUnacked))
			metrics.SetGauge([]string{"nomad", "broker", "total_blocked"}, float32(stats.TotalBlocked))
			metrics.SetGauge([]string{"nomad", "broker", "total_waiting"}, float32(stats.TotalWaiting))
			for sched, schedStats := range stats.ByScheduler {
				metrics.SetGauge([]string{"nomad", "broker", sched, "ready"}, float32(schedStats.Ready))
				metrics.SetGauge([]string{"nomad", "broker", sched, "unacked"}, float32(schedStats.Unacked))
			}

		case <-stopCh:
			return
		}
	}
}

// BrokerStats holds all the stats about the broker
type BrokerStats struct {
	TotalReady   int
	TotalUnacked int
	TotalBlocked int
	TotalWaiting int
	ByScheduler  map[string]*SchedulerStats
}

// SchedulerStats holds the stats per scheduler
type SchedulerStats struct {
	Ready   int
	Unacked int
}

// Len is for the sorting interface
func (p PendingEvaluations) Len() int {
	return len(p)
}

// Less is for the sorting interface. We flip the check
// so that the "min" in the min-heap is the element with the
// highest priority
func (p PendingEvaluations) Less(i, j int) bool {
	if p[i].JobID != p[j].JobID && p[i].Priority != p[j].Priority {
		return !(p[i].Priority < p[j].Priority)
	}
	return p[i].CreateIndex < p[j].CreateIndex
}

// Swap is for the sorting interface
func (p PendingEvaluations) Swap(i, j int) {
	p[i], p[j] = p[j], p[i]
}

// Push is used to add a new evaluation to the slice
func (p *PendingEvaluations) Push(e interface{}) {
	*p = append(*p, e.(*structs.Evaluation))
}

// Pop is used to remove an evaluation from the slice
func (p *PendingEvaluations) Pop() interface{} {
	n := len(*p)
	e := (*p)[n-1]
	(*p)[n-1] = nil
	*p = (*p)[:n-1]
	return e
}

// Peek is used to peek at the next element that would be popped
func (p PendingEvaluations) Peek() *structs.Evaluation {
	n := len(p)
	if n == 0 {
		return nil
	}
	return p[n-1]
}
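
// exampleBrokerLifecycle is a minimal sketch of the at-least-once
// Enqueue/Dequeue/Ack flow described in the EvalBroker doc comment. It is
// illustrative only: it assumes structs.JobTypeService names the service
// scheduler queue, and the job ID and priority below are placeholders.
func exampleBrokerLifecycle() error {
	// Nack timeout of five seconds and a delivery limit of three attempts
	// before an eval is routed to the failedQueue.
	broker, err := NewEvalBroker(5*time.Second, 3)
	if err != nil {
		return err
	}
	// The broker only accepts and hands out work while enabled (leader only).
	broker.SetEnabled(true)

	broker.Enqueue(&structs.Evaluation{
		ID:       structs.GenerateUUID(),
		JobID:    "example-job",
		Priority: 50,
		Type:     structs.JobTypeService,
	})

	// Dequeue blocks up to the timeout for work on the listed scheduler
	// queues and returns the eval along with a delivery token.
	eval, token, err := broker.Dequeue([]string{structs.JobTypeService}, time.Second)
	if err != nil || eval == nil {
		return err
	}

	// Ack with the token before the Nack timeout fires; otherwise the eval
	// is treated as Nack'd and redelivered.
	return broker.Ack(eval.ID, token)
}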
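
// examplePendingOrder is a minimal sketch of the flipped ordering implemented
// by PendingEvaluations.Less: when evaluations for different jobs carry
// different priorities, the higher priority pops first; otherwise the heap
// falls back to CreateIndex (FIFO) order. The IDs, job IDs, priorities, and
// create indexes are placeholders.
func examplePendingOrder() *structs.Evaluation {
	var pending PendingEvaluations
	heap.Push(&pending, &structs.Evaluation{ID: "low", JobID: "job-a", Priority: 20, CreateIndex: 1})
	heap.Push(&pending, &structs.Evaluation{ID: "high", JobID: "job-b", Priority: 80, CreateIndex: 2})
	heap.Push(&pending, &structs.Evaluation{ID: "mid", JobID: "job-c", Priority: 50, CreateIndex: 3})

	// Pops the Priority 80 evaluation ("high") first despite its later CreateIndex.
	return heap.Pop(&pending).(*structs.Evaluation)
}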