// Source: github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/eval_broker.go

package nomad

import (
	"container/heap"
	"errors"
	"fmt"
	"math/rand"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// failedQueue is the queue we add Evaluations to once
	// they've reached the deliveryLimit. This allows the leader to
	// set the status to failed.
	failedQueue = "_failed"
)

var (
	// ErrNotOutstanding is returned if an evaluation is not outstanding
	ErrNotOutstanding = errors.New("evaluation is not outstanding")

	// ErrTokenMismatch is returned if the outstanding eval has a different token
	ErrTokenMismatch = errors.New("evaluation token does not match")

	// ErrNackTimeoutReached is returned if an expired evaluation is reset
	ErrNackTimeoutReached = errors.New("evaluation nack timeout reached")
)

// EvalBroker is used to manage brokering of evaluations. When an evaluation is
// created, due to a change in a job specification or a node, we put it into the
// broker. The broker sorts evaluations by priority and scheduler type. This
// allows us to dequeue the highest priority work first, while also allowing sub-schedulers
// to only dequeue work they know how to handle. The broker is designed to be entirely
// in-memory and is managed by the leader node.
//
// The broker must provide at-least-once delivery semantics. It relies on explicit
// Ack/Nack messages to handle this. If a delivery is not Ack'd in a sufficient time
// span, it will be assumed Nack'd.
type EvalBroker struct {
	// nackTimeout is the duration of the Nack timer armed for every
	// delivered evaluation.
	nackTimeout time.Duration

	// deliveryLimit is the delivery count at which a Nack'd evaluation is
	// re-enqueued on the failedQueue instead of its own queue.
	deliveryLimit int

	// enabled gates enqueue and dequeue; see SetEnabled.
	enabled bool

	// stats holds the live counters that Stats copies out.
	stats *BrokerStats

	// evals tracks queued evaluations by ID to de-duplicate enqueue.
	// The counter is the number of times we've attempted delivery,
	// and is used to eventually fail an evaluation.
	evals map[string]int

	// jobEvals tracks queued evaluations by JobID to serialize them.
	// The value is the ID of the evaluation currently in flight for the job.
	jobEvals map[string]string

	// blocked tracks the blocked evaluations by JobID in a priority queue
	blocked map[string]PendingEvaluations

	// ready tracks the ready jobs by scheduler in a priority queue
	ready map[string]PendingEvaluations

	// unack is a map of evalID to an un-acknowledged evaluation
	unack map[string]*unackEval

	// waiting is used to notify on a per-scheduler basis of ready work
	waiting map[string]chan struct{}

	// timeWait has evaluations that are waiting for time to elapse
	timeWait map[string]*time.Timer

	// l is the lock the broker's methods take around access to its state.
	l sync.RWMutex
}

// unackEval tracks an unacknowledged evaluation along with the Nack timer
type unackEval struct {
	Eval      *structs.Evaluation
	Token     string
	NackTimer *time.Timer
}

// PendingEvaluations is a list of waiting evaluations.
// We implement the container/heap interface so that this is a
// priority queue
type PendingEvaluations []*structs.Evaluation

// NewEvalBroker creates a new evaluation broker. This is parameterized
// with the timeout used for messages that are not acknowledged before we
// assume a Nack and attempt to redeliver as well as the deliveryLimit
// which prevents a failing eval from being endlessly delivered.
92 func NewEvalBroker(timeout time.Duration, deliveryLimit int) (*EvalBroker, error) { 93 if timeout < 0 { 94 return nil, fmt.Errorf("timeout cannot be negative") 95 } 96 b := &EvalBroker{ 97 nackTimeout: timeout, 98 deliveryLimit: deliveryLimit, 99 enabled: false, 100 stats: new(BrokerStats), 101 evals: make(map[string]int), 102 jobEvals: make(map[string]string), 103 blocked: make(map[string]PendingEvaluations), 104 ready: make(map[string]PendingEvaluations), 105 unack: make(map[string]*unackEval), 106 waiting: make(map[string]chan struct{}), 107 timeWait: make(map[string]*time.Timer), 108 } 109 b.stats.ByScheduler = make(map[string]*SchedulerStats) 110 return b, nil 111 } 112 113 // Enabled is used to check if the broker is enabled. 114 func (b *EvalBroker) Enabled() bool { 115 b.l.RLock() 116 defer b.l.RUnlock() 117 return b.enabled 118 } 119 120 // SetEnabled is used to control if the broker is enabled. The broker 121 // should only be enabled on the active leader. 122 func (b *EvalBroker) SetEnabled(enabled bool) { 123 b.l.Lock() 124 b.enabled = enabled 125 b.l.Unlock() 126 if !enabled { 127 b.Flush() 128 } 129 } 130 131 // Enqueue is used to enqueue an evaluation 132 func (b *EvalBroker) Enqueue(eval *structs.Evaluation) error { 133 b.l.Lock() 134 defer b.l.Unlock() 135 136 // Check if already enqueued 137 if _, ok := b.evals[eval.ID]; ok { 138 return nil 139 } else if b.enabled { 140 b.evals[eval.ID] = 0 141 } 142 143 // Check if we need to enforce a wait 144 if eval.Wait > 0 { 145 timer := time.AfterFunc(eval.Wait, func() { 146 b.enqueueWaiting(eval) 147 }) 148 b.timeWait[eval.ID] = timer 149 b.stats.TotalWaiting += 1 150 return nil 151 } 152 153 b.enqueueLocked(eval, eval.Type) 154 return nil 155 } 156 157 // enqueueWaiting is used to enqueue a waiting evaluation 158 func (b *EvalBroker) enqueueWaiting(eval *structs.Evaluation) { 159 b.l.Lock() 160 defer b.l.Unlock() 161 delete(b.timeWait, eval.ID) 162 b.stats.TotalWaiting -= 1 163 b.enqueueLocked(eval, 
eval.Type) 164 } 165 166 // enqueueLocked is used to enqueue with the lock held 167 func (b *EvalBroker) enqueueLocked(eval *structs.Evaluation, queue string) { 168 // Do nothing if not enabled 169 if !b.enabled { 170 return 171 } 172 173 // Check if there is an evaluation for this JobID pending 174 pendingEval := b.jobEvals[eval.JobID] 175 if pendingEval == "" { 176 b.jobEvals[eval.JobID] = eval.ID 177 } else if pendingEval != eval.ID { 178 blocked := b.blocked[eval.JobID] 179 heap.Push(&blocked, eval) 180 b.blocked[eval.JobID] = blocked 181 b.stats.TotalBlocked += 1 182 return 183 } 184 185 // Find the pending by scheduler class 186 pending, ok := b.ready[queue] 187 if !ok { 188 pending = make([]*structs.Evaluation, 0, 16) 189 if _, ok := b.waiting[queue]; !ok { 190 b.waiting[queue] = make(chan struct{}, 1) 191 } 192 } 193 194 // Push onto the heap 195 heap.Push(&pending, eval) 196 b.ready[queue] = pending 197 198 // Update the stats 199 b.stats.TotalReady += 1 200 bySched, ok := b.stats.ByScheduler[queue] 201 if !ok { 202 bySched = &SchedulerStats{} 203 b.stats.ByScheduler[queue] = bySched 204 } 205 bySched.Ready += 1 206 207 // Unblock any blocked dequeues 208 select { 209 case b.waiting[queue] <- struct{}{}: 210 default: 211 } 212 } 213 214 // Dequeue is used to perform a blocking dequeue 215 func (b *EvalBroker) Dequeue(schedulers []string, timeout time.Duration) (*structs.Evaluation, string, error) { 216 var timeoutTimer *time.Timer 217 var timeoutCh <-chan time.Time 218 SCAN: 219 // Scan for work 220 eval, token, err := b.scanForSchedulers(schedulers) 221 if err != nil { 222 if timeoutTimer != nil { 223 timeoutTimer.Stop() 224 } 225 return nil, "", err 226 } 227 228 // Check if we have something 229 if eval != nil { 230 if timeoutTimer != nil { 231 timeoutTimer.Stop() 232 } 233 return eval, token, nil 234 } 235 236 // Setup the timeout channel the first time around 237 if timeoutTimer == nil && timeout != 0 { 238 timeoutTimer = time.NewTimer(timeout) 239 
timeoutCh = timeoutTimer.C 240 } 241 242 // Block until we get work 243 scan := b.waitForSchedulers(schedulers, timeoutCh) 244 if scan { 245 goto SCAN 246 } 247 return nil, "", nil 248 } 249 250 // scanForSchedulers scans for work on any of the schedulers. The highest priority work 251 // is dequeued first. This may return nothing if there is no work waiting. 252 func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation, string, error) { 253 b.l.Lock() 254 defer b.l.Unlock() 255 256 // Do nothing if not enabled 257 if !b.enabled { 258 return nil, "", fmt.Errorf("eval broker disabled") 259 } 260 261 // Scan for eligible work 262 var eligibleSched []string 263 var eligiblePriority int 264 for _, sched := range schedulers { 265 // Get the pending queue 266 pending, ok := b.ready[sched] 267 if !ok { 268 continue 269 } 270 271 // Peek at the next item 272 ready := pending.Peek() 273 if ready == nil { 274 continue 275 } 276 277 // Add to eligible if equal or greater priority 278 if len(eligibleSched) == 0 || ready.Priority > eligiblePriority { 279 eligibleSched = []string{sched} 280 eligiblePriority = ready.Priority 281 282 } else if eligiblePriority > ready.Priority { 283 continue 284 285 } else if eligiblePriority == ready.Priority { 286 eligibleSched = append(eligibleSched, sched) 287 } 288 } 289 290 // Determine behavior based on eligible work 291 switch n := len(eligibleSched); n { 292 case 0: 293 // No work to do! 294 return nil, "", nil 295 296 case 1: 297 // Only a single task, dequeue 298 return b.dequeueForSched(eligibleSched[0]) 299 300 default: 301 // Multiple tasks. We pick a random task so that we fairly 302 // distribute work. 303 offset := rand.Int63() % int64(n) 304 return b.dequeueForSched(eligibleSched[offset]) 305 } 306 } 307 308 // dequeueForSched is used to dequeue the next work item for a given scheduler. 
309 // This assumes locks are held and that this scheduler has work 310 func (b *EvalBroker) dequeueForSched(sched string) (*structs.Evaluation, string, error) { 311 // Get the pending queue 312 pending := b.ready[sched] 313 raw := heap.Pop(&pending) 314 b.ready[sched] = pending 315 eval := raw.(*structs.Evaluation) 316 317 // Generate a UUID for the token 318 token := structs.GenerateUUID() 319 320 // Setup Nack timer 321 nackTimer := time.AfterFunc(b.nackTimeout, func() { 322 b.Nack(eval.ID, token) 323 }) 324 325 // Add to the unack queue 326 b.unack[eval.ID] = &unackEval{ 327 Eval: eval, 328 Token: token, 329 NackTimer: nackTimer, 330 } 331 332 // Increment the dequeue count 333 b.evals[eval.ID] += 1 334 335 // Update the stats 336 b.stats.TotalReady -= 1 337 b.stats.TotalUnacked += 1 338 bySched := b.stats.ByScheduler[sched] 339 bySched.Ready -= 1 340 bySched.Unacked += 1 341 342 return eval, token, nil 343 } 344 345 // waitForSchedulers is used to wait for work on any of the scheduler or until a timeout. 346 // Returns if there is work waiting potentially. 
347 func (b *EvalBroker) waitForSchedulers(schedulers []string, timeoutCh <-chan time.Time) bool { 348 doneCh := make(chan struct{}) 349 readyCh := make(chan struct{}, 1) 350 defer close(doneCh) 351 352 // Start all the watchers 353 b.l.Lock() 354 for _, sched := range schedulers { 355 waitCh, ok := b.waiting[sched] 356 if !ok { 357 waitCh = make(chan struct{}, 1) 358 b.waiting[sched] = waitCh 359 } 360 361 // Start a goroutine that either waits for the waitCh on this scheduler 362 // to unblock or for this waitForSchedulers call to return 363 go func() { 364 select { 365 case <-waitCh: 366 select { 367 case readyCh <- struct{}{}: 368 default: 369 } 370 case <-doneCh: 371 } 372 }() 373 } 374 b.l.Unlock() 375 376 // Block until we have ready work and should scan, or until we timeout 377 // and should not make an attempt to scan for work 378 select { 379 case <-readyCh: 380 return true 381 case <-timeoutCh: 382 return false 383 } 384 } 385 386 // Outstanding checks if an EvalID has been delivered but not acknowledged 387 // and returns the associated token for the evaluation. 
388 func (b *EvalBroker) Outstanding(evalID string) (string, bool) { 389 b.l.RLock() 390 defer b.l.RUnlock() 391 unack, ok := b.unack[evalID] 392 if !ok { 393 return "", false 394 } 395 return unack.Token, true 396 } 397 398 // OutstandingReset resets the Nack timer for the EvalID if the 399 // token matches and the eval is outstanding 400 func (b *EvalBroker) OutstandingReset(evalID, token string) error { 401 b.l.RLock() 402 defer b.l.RUnlock() 403 unack, ok := b.unack[evalID] 404 if !ok { 405 return ErrNotOutstanding 406 } 407 if unack.Token != token { 408 return ErrTokenMismatch 409 } 410 if !unack.NackTimer.Reset(b.nackTimeout) { 411 return ErrNackTimeoutReached 412 } 413 return nil 414 } 415 416 // Ack is used to positively acknowledge handling an evaluation 417 func (b *EvalBroker) Ack(evalID, token string) error { 418 b.l.Lock() 419 defer b.l.Unlock() 420 421 // Lookup the unack'd eval 422 unack, ok := b.unack[evalID] 423 if !ok { 424 return fmt.Errorf("Evaluation ID not found") 425 } 426 if unack.Token != token { 427 return fmt.Errorf("Token does not match for Evaluation ID") 428 } 429 jobID := unack.Eval.JobID 430 431 // Ensure we were able to stop the timer 432 if !unack.NackTimer.Stop() { 433 return fmt.Errorf("Evaluation ID Ack'd after Nack timer expiration") 434 } 435 436 // Update the stats 437 b.stats.TotalUnacked -= 1 438 queue := unack.Eval.Type 439 if b.evals[evalID] >= b.deliveryLimit { 440 queue = failedQueue 441 } 442 bySched := b.stats.ByScheduler[queue] 443 bySched.Unacked -= 1 444 445 // Cleanup 446 delete(b.unack, evalID) 447 delete(b.evals, evalID) 448 delete(b.jobEvals, jobID) 449 450 // Check if there are any blocked evaluations 451 if blocked := b.blocked[jobID]; len(blocked) != 0 { 452 raw := heap.Pop(&blocked) 453 if len(blocked) > 0 { 454 b.blocked[jobID] = blocked 455 } else { 456 delete(b.blocked, jobID) 457 } 458 eval := raw.(*structs.Evaluation) 459 b.stats.TotalBlocked -= 1 460 b.enqueueLocked(eval, eval.Type) 461 return nil 462 
} 463 return nil 464 } 465 466 // Nack is used to negatively acknowledge handling an evaluation 467 func (b *EvalBroker) Nack(evalID, token string) error { 468 b.l.Lock() 469 defer b.l.Unlock() 470 471 // Lookup the unack'd eval 472 unack, ok := b.unack[evalID] 473 if !ok { 474 return fmt.Errorf("Evaluation ID not found") 475 } 476 if unack.Token != token { 477 return fmt.Errorf("Token does not match for Evaluation ID") 478 } 479 480 // Stop the timer, doesn't matter if we've missed it 481 unack.NackTimer.Stop() 482 483 // Cleanup 484 delete(b.unack, evalID) 485 486 // Update the stats 487 b.stats.TotalUnacked -= 1 488 bySched := b.stats.ByScheduler[unack.Eval.Type] 489 bySched.Unacked -= 1 490 491 // Check if we've hit the delivery limit, and re-enqueue 492 // in the failedQueue 493 if b.evals[evalID] >= b.deliveryLimit { 494 b.enqueueLocked(unack.Eval, failedQueue) 495 } else { 496 b.enqueueLocked(unack.Eval, unack.Eval.Type) 497 } 498 return nil 499 } 500 501 // Flush is used to clear the state of the broker 502 func (b *EvalBroker) Flush() { 503 b.l.Lock() 504 defer b.l.Unlock() 505 506 // Unblock any waiters 507 for _, waitCh := range b.waiting { 508 close(waitCh) 509 } 510 b.waiting = make(map[string]chan struct{}) 511 512 // Cancel any Nack timers 513 for _, unack := range b.unack { 514 unack.NackTimer.Stop() 515 } 516 517 // Cancel any time wait evals 518 for _, wait := range b.timeWait { 519 wait.Stop() 520 } 521 522 // Reset the broker 523 b.stats.TotalReady = 0 524 b.stats.TotalUnacked = 0 525 b.stats.TotalBlocked = 0 526 b.stats.TotalWaiting = 0 527 b.stats.ByScheduler = make(map[string]*SchedulerStats) 528 b.evals = make(map[string]int) 529 b.jobEvals = make(map[string]string) 530 b.blocked = make(map[string]PendingEvaluations) 531 b.ready = make(map[string]PendingEvaluations) 532 b.unack = make(map[string]*unackEval) 533 b.timeWait = make(map[string]*time.Timer) 534 } 535 536 // Stats is used to query the state of the broker 537 func (b *EvalBroker) 
Stats() *BrokerStats { 538 // Allocate a new stats struct 539 stats := new(BrokerStats) 540 stats.ByScheduler = make(map[string]*SchedulerStats) 541 542 b.l.RLock() 543 defer b.l.RUnlock() 544 545 // Copy all the stats 546 stats.TotalReady = b.stats.TotalReady 547 stats.TotalUnacked = b.stats.TotalUnacked 548 stats.TotalBlocked = b.stats.TotalBlocked 549 stats.TotalWaiting = b.stats.TotalWaiting 550 for sched, subStat := range b.stats.ByScheduler { 551 subStatCopy := new(SchedulerStats) 552 *subStatCopy = *subStat 553 stats.ByScheduler[sched] = subStatCopy 554 } 555 return stats 556 } 557 558 // EmitStats is used to export metrics about the broker while enabled 559 func (b *EvalBroker) EmitStats(period time.Duration, stopCh chan struct{}) { 560 for { 561 select { 562 case <-time.After(period): 563 stats := b.Stats() 564 metrics.SetGauge([]string{"nomad", "broker", "total_ready"}, float32(stats.TotalReady)) 565 metrics.SetGauge([]string{"nomad", "broker", "total_unacked"}, float32(stats.TotalUnacked)) 566 metrics.SetGauge([]string{"nomad", "broker", "total_blocked"}, float32(stats.TotalBlocked)) 567 metrics.SetGauge([]string{"nomad", "broker", "total_waiting"}, float32(stats.TotalWaiting)) 568 for sched, schedStats := range stats.ByScheduler { 569 metrics.SetGauge([]string{"nomad", "broker", sched, "ready"}, float32(schedStats.Ready)) 570 metrics.SetGauge([]string{"nomad", "broker", sched, "unacked"}, float32(schedStats.Unacked)) 571 } 572 573 case <-stopCh: 574 return 575 } 576 } 577 } 578 579 // BrokerStats returns all the stats about the broker 580 type BrokerStats struct { 581 TotalReady int 582 TotalUnacked int 583 TotalBlocked int 584 TotalWaiting int 585 ByScheduler map[string]*SchedulerStats 586 } 587 588 // SchedulerStats returns the stats per scheduler 589 type SchedulerStats struct { 590 Ready int 591 Unacked int 592 } 593 594 // Len is for the sorting interface 595 func (p PendingEvaluations) Len() int { 596 return len(p) 597 } 598 599 // Less is for the 
sorting interface. We flip the check 600 // so that the "min" in the min-heap is the element with the 601 // highest priority 602 func (p PendingEvaluations) Less(i, j int) bool { 603 if p[i].JobID != p[j].JobID && p[i].Priority != p[j].Priority { 604 return !(p[i].Priority < p[j].Priority) 605 } 606 return p[i].CreateIndex < p[j].CreateIndex 607 } 608 609 // Swap is for the sorting interface 610 func (p PendingEvaluations) Swap(i, j int) { 611 p[i], p[j] = p[j], p[i] 612 } 613 614 // Push is used to add a new evalution to the slice 615 func (p *PendingEvaluations) Push(e interface{}) { 616 *p = append(*p, e.(*structs.Evaluation)) 617 } 618 619 // Pop is used to remove an evaluation from the slice 620 func (p *PendingEvaluations) Pop() interface{} { 621 n := len(*p) 622 e := (*p)[n-1] 623 (*p)[n-1] = nil 624 *p = (*p)[:n-1] 625 return e 626 } 627 628 // Peek is used to peek at the next element that would be popped 629 func (p PendingEvaluations) Peek() *structs.Evaluation { 630 n := len(p) 631 if n == 0 { 632 return nil 633 } 634 return p[n-1] 635 }