github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/eval_broker.go (about) 1 package nomad 2 3 import ( 4 "container/heap" 5 "fmt" 6 "math/rand" 7 "sync" 8 "time" 9 10 "github.com/armon/go-metrics" 11 "github.com/hashicorp/nomad/nomad/structs" 12 ) 13 14 const ( 15 // failedQueue is the queue we add Evaluations to once 16 // they've reached the deliveryLimit. This allows the leader to 17 // set the status to failed. 18 failedQueue = "_failed" 19 ) 20 21 // EvalBroker is used to manage brokering of evaluations. When an evaluation is 22 // created, due to a change in a job specification or a node, we put it into the 23 // broker. The broker sorts by evaluations by priority and scheduler type. This 24 // allows us to dequeue the highest priority work first, while also allowing sub-schedulers 25 // to only dequeue work they know how to handle. The broker is designed to be entirely 26 // in-memory and is managed by the leader node. 27 // 28 // The broker must provide at-least-once delivery semantics. It relies on explicit 29 // Ack/Nack messages to handle this. If a delivery is not Ack'd in a sufficient time 30 // span, it will be assumed Nack'd. 31 type EvalBroker struct { 32 nackTimeout time.Duration 33 deliveryLimit int 34 35 enabled bool 36 stats *BrokerStats 37 38 // evals tracks queued evaluations by ID to de-duplicate enqueue. 39 // The counter is the number of times we've attempted delivery, 40 // and is used to eventually fail an evaluation. 41 evals map[string]int 42 43 // jobEvals tracks queued evaluations by JobID to serialize them 44 jobEvals map[string]string 45 46 // blocked tracks the blocked evaluations by JobID in a priority queue 47 blocked map[string]PendingEvaluations 48 49 // ready tracks the ready jobs by scheduler in a priority queue 50 ready map[string]PendingEvaluations 51 52 // unack is a map of evalID to an un-acknowledged evaluation 53 unack map[string]*unackEval 54 55 // waiting is used to notify on a per-scheduler basis of ready work 56 waiting map[string]chan struct{} 57 58 // timeWait has evaluations that are waiting for time to elapse 59 timeWait map[string]*time.Timer 60 61 l sync.RWMutex 62 } 63 64 // unackEval tracks an unacknowledged evaluation along with the Nack timer 65 type unackEval struct { 66 Eval *structs.Evaluation 67 Token string 68 NackTimer *time.Timer 69 } 70 71 // PendingEvaluations is a list of waiting evaluations. 72 // We implement the container/heap interface so that this is a 73 // priority queue 74 type PendingEvaluations []*structs.Evaluation 75 76 // NewEvalBroker creates a new evaluation broker. This is parameterized 77 // with the timeout used for messages that are not acknowledged before we 78 // assume a Nack and attempt to redeliver as well as the deliveryLimit 79 // which prevents a failing eval from being endlessly delivered. 80 func NewEvalBroker(timeout time.Duration, deliveryLimit int) (*EvalBroker, error) { 81 if timeout < 0 { 82 return nil, fmt.Errorf("timeout cannot be negative") 83 } 84 b := &EvalBroker{ 85 nackTimeout: timeout, 86 deliveryLimit: deliveryLimit, 87 enabled: false, 88 stats: new(BrokerStats), 89 evals: make(map[string]int), 90 jobEvals: make(map[string]string), 91 blocked: make(map[string]PendingEvaluations), 92 ready: make(map[string]PendingEvaluations), 93 unack: make(map[string]*unackEval), 94 waiting: make(map[string]chan struct{}), 95 timeWait: make(map[string]*time.Timer), 96 } 97 b.stats.ByScheduler = make(map[string]*SchedulerStats) 98 return b, nil 99 } 100 101 // Enabled is used to check if the broker is enabled. 102 func (b *EvalBroker) Enabled() bool { 103 b.l.RLock() 104 defer b.l.RUnlock() 105 return b.enabled 106 } 107 108 // SetEnabled is used to control if the broker is enabled. The broker 109 // should only be enabled on the active leader. 110 func (b *EvalBroker) SetEnabled(enabled bool) { 111 b.l.Lock() 112 b.enabled = enabled 113 b.l.Unlock() 114 if !enabled { 115 b.Flush() 116 } 117 } 118 119 // Enqueue is used to enqueue an evaluation 120 func (b *EvalBroker) Enqueue(eval *structs.Evaluation) error { 121 b.l.Lock() 122 defer b.l.Unlock() 123 124 // Check if already enqueued 125 if _, ok := b.evals[eval.ID]; ok { 126 return nil 127 } else if b.enabled { 128 b.evals[eval.ID] = 0 129 } 130 131 // Check if we need to enforce a wait 132 if eval.Wait > 0 { 133 timer := time.AfterFunc(eval.Wait, func() { 134 b.enqueueWaiting(eval) 135 }) 136 b.timeWait[eval.ID] = timer 137 b.stats.TotalWaiting += 1 138 return nil 139 } 140 141 b.enqueueLocked(eval, eval.Type) 142 return nil 143 } 144 145 // enqueueWaiting is used to enqueue a waiting evaluation 146 func (b *EvalBroker) enqueueWaiting(eval *structs.Evaluation) { 147 b.l.Lock() 148 defer b.l.Unlock() 149 delete(b.timeWait, eval.ID) 150 b.stats.TotalWaiting -= 1 151 b.enqueueLocked(eval, eval.Type) 152 } 153 154 // enqueueLocked is used to enqueue with the lock held 155 func (b *EvalBroker) enqueueLocked(eval *structs.Evaluation, queue string) { 156 // Do nothing if not enabled 157 if !b.enabled { 158 return 159 } 160 161 // Check if there is an evaluation for this JobID pending 162 pendingEval := b.jobEvals[eval.JobID] 163 if pendingEval == "" { 164 b.jobEvals[eval.JobID] = eval.ID 165 } else if pendingEval != eval.ID { 166 blocked := b.blocked[eval.JobID] 167 heap.Push(&blocked, eval) 168 b.blocked[eval.JobID] = blocked 169 b.stats.TotalBlocked += 1 170 return 171 } 172 173 // Find the pending by scheduler class 174 pending, ok := b.ready[queue] 175 if !ok { 176 pending = make([]*structs.Evaluation, 0, 16) 177 if _, ok := b.waiting[queue]; !ok { 178 b.waiting[queue] = make(chan struct{}, 1) 179 } 180 } 181 182 // Push onto the heap 183 heap.Push(&pending, eval) 184 b.ready[queue] = pending 185 186 // Update the stats 187 b.stats.TotalReady += 1 188 bySched, ok := b.stats.ByScheduler[queue] 189 if !ok { 190 bySched = &SchedulerStats{} 191 b.stats.ByScheduler[queue] = bySched 192 } 193 bySched.Ready += 1 194 195 // Unblock any blocked dequeues 196 select { 197 case b.waiting[queue] <- struct{}{}: 198 default: 199 } 200 } 201 202 // Dequeue is used to perform a blocking dequeue 203 func (b *EvalBroker) Dequeue(schedulers []string, timeout time.Duration) (*structs.Evaluation, string, error) { 204 var timeoutTimer *time.Timer 205 SCAN: 206 // Scan for work 207 eval, token, err := b.scanForSchedulers(schedulers) 208 if err != nil { 209 if timeoutTimer != nil { 210 timeoutTimer.Stop() 211 } 212 return nil, "", err 213 } 214 215 // Check if we have something 216 if eval != nil { 217 if timeoutTimer != nil { 218 timeoutTimer.Stop() 219 } 220 return eval, token, nil 221 } 222 223 // Setup the timeout channel the first time around 224 if timeoutTimer == nil && timeout != 0 { 225 timeoutTimer = time.NewTimer(timeout) 226 } 227 228 // Block until we get work 229 scan := b.waitForSchedulers(schedulers, timeoutTimer.C) 230 if scan { 231 goto SCAN 232 } 233 return nil, "", nil 234 } 235 236 // scanForSchedulers scans for work on any of the schedulers. The highest priority work 237 // is dequeued first. This may return nothing if there is no work waiting. 238 func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation, string, error) { 239 b.l.Lock() 240 defer b.l.Unlock() 241 242 // Do nothing if not enabled 243 if !b.enabled { 244 return nil, "", fmt.Errorf("eval broker disabled") 245 } 246 247 // Scan for eligible work 248 var eligibleSched []string 249 var eligiblePriority int 250 for _, sched := range schedulers { 251 // Get the pending queue 252 pending, ok := b.ready[sched] 253 if !ok { 254 continue 255 } 256 257 // Peek at the next item 258 ready := pending.Peek() 259 if ready == nil { 260 continue 261 } 262 263 // Add to eligible if equal or greater priority 264 if len(eligibleSched) == 0 || ready.Priority > eligiblePriority { 265 eligibleSched = []string{sched} 266 eligiblePriority = ready.Priority 267 268 } else if eligiblePriority > ready.Priority { 269 continue 270 271 } else if eligiblePriority == ready.Priority { 272 eligibleSched = append(eligibleSched, sched) 273 } 274 } 275 276 // Determine behavior based on eligible work 277 switch n := len(eligibleSched); n { 278 case 0: 279 // No work to do! 280 return nil, "", nil 281 282 case 1: 283 // Only a single task, dequeue 284 return b.dequeueForSched(eligibleSched[0]) 285 286 default: 287 // Multiple tasks. We pick a random task so that we fairly 288 // distribute work. 289 offset := rand.Int63() % int64(n) 290 return b.dequeueForSched(eligibleSched[offset]) 291 } 292 } 293 294 // dequeueForSched is used to dequeue the next work item for a given scheduler. 295 // This assumes locks are held and that this scheduler has work 296 func (b *EvalBroker) dequeueForSched(sched string) (*structs.Evaluation, string, error) { 297 // Get the pending queue 298 pending := b.ready[sched] 299 raw := heap.Pop(&pending) 300 b.ready[sched] = pending 301 eval := raw.(*structs.Evaluation) 302 303 // Generate a UUID for the token 304 token := structs.GenerateUUID() 305 306 // Setup Nack timer 307 nackTimer := time.AfterFunc(b.nackTimeout, func() { 308 b.Nack(eval.ID, token) 309 }) 310 311 // Add to the unack queue 312 b.unack[eval.ID] = &unackEval{ 313 Eval: eval, 314 Token: token, 315 NackTimer: nackTimer, 316 } 317 318 // Increment the dequeue count 319 b.evals[eval.ID] += 1 320 321 // Update the stats 322 b.stats.TotalReady -= 1 323 b.stats.TotalUnacked += 1 324 bySched := b.stats.ByScheduler[sched] 325 bySched.Ready -= 1 326 bySched.Unacked += 1 327 328 return eval, token, nil 329 } 330 331 // waitForSchedulers is used to wait for work on any of the scheduler or until a timeout. 332 // Returns if there is work waiting potentially. 333 func (b *EvalBroker) waitForSchedulers(schedulers []string, timeoutCh <-chan time.Time) bool { 334 doneCh := make(chan struct{}) 335 readyCh := make(chan struct{}, 1) 336 defer close(doneCh) 337 338 // Start all the watchers 339 b.l.Lock() 340 for _, sched := range schedulers { 341 waitCh, ok := b.waiting[sched] 342 if !ok { 343 waitCh = make(chan struct{}, 1) 344 b.waiting[sched] = waitCh 345 } 346 347 // Start a goroutine that either waits for the waitCh on this scheduler 348 // to unblock or for this waitForSchedulers call to return 349 go func() { 350 select { 351 case <-waitCh: 352 select { 353 case readyCh <- struct{}{}: 354 default: 355 } 356 case <-doneCh: 357 } 358 }() 359 } 360 b.l.Unlock() 361 362 // Block until we have ready work and should scan, or until we timeout 363 // and should not make an attempt to scan for work 364 select { 365 case <-readyCh: 366 return true 367 case <-timeoutCh: 368 return false 369 } 370 } 371 372 // Outstanding checks if an EvalID has been delivered but not acknowledged 373 // and returns the associated token for the evaluation. 374 func (b *EvalBroker) Outstanding(evalID string) (string, bool) { 375 b.l.RLock() 376 defer b.l.RUnlock() 377 unack, ok := b.unack[evalID] 378 if !ok { 379 return "", false 380 } 381 return unack.Token, true 382 } 383 384 // Ack is used to positively acknowledge handling an evaluation 385 func (b *EvalBroker) Ack(evalID, token string) error { 386 b.l.Lock() 387 defer b.l.Unlock() 388 389 // Lookup the unack'd eval 390 unack, ok := b.unack[evalID] 391 if !ok { 392 return fmt.Errorf("Evaluation ID not found") 393 } 394 if unack.Token != token { 395 return fmt.Errorf("Token does not match for Evaluation ID") 396 } 397 jobID := unack.Eval.JobID 398 399 // Ensure we were able to stop the timer 400 if !unack.NackTimer.Stop() { 401 return fmt.Errorf("Evaluation ID Ack'd after Nack timer expiration") 402 } 403 404 // Update the stats 405 b.stats.TotalUnacked -= 1 406 queue := unack.Eval.Type 407 if b.evals[evalID] >= b.deliveryLimit { 408 queue = failedQueue 409 } 410 bySched := b.stats.ByScheduler[queue] 411 bySched.Unacked -= 1 412 413 // Cleanup 414 delete(b.unack, evalID) 415 delete(b.evals, evalID) 416 delete(b.jobEvals, jobID) 417 418 // Check if there are any blocked evaluations 419 if blocked := b.blocked[jobID]; len(blocked) != 0 { 420 raw := heap.Pop(&blocked) 421 if len(blocked) > 0 { 422 b.blocked[jobID] = blocked 423 } else { 424 delete(b.blocked, jobID) 425 } 426 eval := raw.(*structs.Evaluation) 427 b.stats.TotalBlocked -= 1 428 b.enqueueLocked(eval, eval.Type) 429 return nil 430 } 431 return nil 432 } 433 434 // Nack is used to negatively acknowledge handling an evaluation 435 func (b *EvalBroker) Nack(evalID, token string) error { 436 b.l.Lock() 437 defer b.l.Unlock() 438 439 // Lookup the unack'd eval 440 unack, ok := b.unack[evalID] 441 if !ok { 442 return fmt.Errorf("Evaluation ID not found") 443 } 444 if unack.Token != token { 445 return fmt.Errorf("Token does not match for Evaluation ID") 446 } 447 448 // Stop the timer, doesn't matter if we've missed it 449 unack.NackTimer.Stop() 450 451 // Cleanup 452 delete(b.unack, evalID) 453 454 // Update the stats 455 b.stats.TotalUnacked -= 1 456 bySched := b.stats.ByScheduler[unack.Eval.Type] 457 bySched.Unacked -= 1 458 459 // Check if we've hit the delivery limit, and re-enqueue 460 // in the failedQueue 461 if b.evals[evalID] >= b.deliveryLimit { 462 b.enqueueLocked(unack.Eval, failedQueue) 463 } else { 464 b.enqueueLocked(unack.Eval, unack.Eval.Type) 465 } 466 return nil 467 } 468 469 // Flush is used to clear the state of the broker 470 func (b *EvalBroker) Flush() { 471 b.l.Lock() 472 defer b.l.Unlock() 473 474 // Unblock any waiters 475 for _, waitCh := range b.waiting { 476 close(waitCh) 477 } 478 b.waiting = make(map[string]chan struct{}) 479 480 // Cancel any Nack timers 481 for _, unack := range b.unack { 482 unack.NackTimer.Stop() 483 } 484 485 // Cancel any time wait evals 486 for _, wait := range b.timeWait { 487 wait.Stop() 488 } 489 490 // Reset the broker 491 b.stats.TotalReady = 0 492 b.stats.TotalUnacked = 0 493 b.stats.TotalBlocked = 0 494 b.stats.TotalWaiting = 0 495 b.stats.ByScheduler = make(map[string]*SchedulerStats) 496 b.evals = make(map[string]int) 497 b.jobEvals = make(map[string]string) 498 b.blocked = make(map[string]PendingEvaluations) 499 b.ready = make(map[string]PendingEvaluations) 500 b.unack = make(map[string]*unackEval) 501 b.timeWait = make(map[string]*time.Timer) 502 } 503 504 // Stats is used to query the state of the broker 505 func (b *EvalBroker) Stats() *BrokerStats { 506 // Allocate a new stats struct 507 stats := new(BrokerStats) 508 stats.ByScheduler = make(map[string]*SchedulerStats) 509 510 b.l.RLock() 511 defer b.l.RUnlock() 512 513 // Copy all the stats 514 stats.TotalReady = b.stats.TotalReady 515 stats.TotalUnacked = b.stats.TotalUnacked 516 stats.TotalBlocked = b.stats.TotalBlocked 517 stats.TotalWaiting = b.stats.TotalWaiting 518 for sched, subStat := range b.stats.ByScheduler { 519 subStatCopy := new(SchedulerStats) 520 *subStatCopy = *subStat 521 stats.ByScheduler[sched] = subStatCopy 522 } 523 return stats 524 } 525 526 // EmitStats is used to export metrics about the broker while enabled 527 func (b *EvalBroker) EmitStats(period time.Duration, stopCh chan struct{}) { 528 for { 529 select { 530 case <-time.After(period): 531 stats := b.Stats() 532 metrics.SetGauge([]string{"nomad", "broker", "total_ready"}, float32(stats.TotalReady)) 533 metrics.SetGauge([]string{"nomad", "broker", "total_unacked"}, float32(stats.TotalUnacked)) 534 metrics.SetGauge([]string{"nomad", "broker", "total_blocked"}, float32(stats.TotalBlocked)) 535 metrics.SetGauge([]string{"nomad", "broker", "total_waiting"}, float32(stats.TotalWaiting)) 536 for sched, schedStats := range stats.ByScheduler { 537 metrics.SetGauge([]string{"nomad", "broker", sched, "ready"}, float32(schedStats.Ready)) 538 metrics.SetGauge([]string{"nomad", "broker", sched, "unacked"}, float32(schedStats.Unacked)) 539 } 540 541 case <-stopCh: 542 return 543 } 544 } 545 } 546 547 // BrokerStats returns all the stats about the broker 548 type BrokerStats struct { 549 TotalReady int 550 TotalUnacked int 551 TotalBlocked int 552 TotalWaiting int 553 ByScheduler map[string]*SchedulerStats 554 } 555 556 // SchedulerStats returns the stats per scheduler 557 type SchedulerStats struct { 558 Ready int 559 Unacked int 560 } 561 562 // Len is for the sorting interface 563 func (p PendingEvaluations) Len() int { 564 return len(p) 565 } 566 567 // Less is for the sorting interface. We flip the check 568 // so that the "min" in the min-heap is the element with the 569 // highest priority 570 func (p PendingEvaluations) Less(i, j int) bool { 571 if p[i].JobID != p[j].JobID && p[i].Priority != p[j].Priority { 572 return !(p[i].Priority < p[j].Priority) 573 } 574 return p[i].CreateIndex < p[j].CreateIndex 575 } 576 577 // Swap is for the sorting interface 578 func (p PendingEvaluations) Swap(i, j int) { 579 p[i], p[j] = p[j], p[i] 580 } 581 582 // Push is used to add a new evalution to the slice 583 func (p *PendingEvaluations) Push(e interface{}) { 584 *p = append(*p, e.(*structs.Evaluation)) 585 } 586 587 // Pop is used to remove an evaluation from the slice 588 func (p *PendingEvaluations) Pop() interface{} { 589 n := len(*p) 590 e := (*p)[n-1] 591 (*p)[n-1] = nil 592 *p = (*p)[:n-1] 593 return e 594 } 595 596 // Peek is used to peek at the next element that would be popped 597 func (p PendingEvaluations) Peek() *structs.Evaluation { 598 n := len(p) 599 if n == 0 { 600 return nil 601 } 602 return p[n-1] 603 }