// Origin: github.com/bigcommerce/nomad@v0.9.3-bc/nomad/blocked_evals.go

package nomad

import (
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	"github.com/hashicorp/consul/lib"
	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// unblockBuffer is the buffer size for the unblock channel. The buffer
	// should be large to ensure that the FSM doesn't block when calling Unblock
	// as this would apply back-pressure on Raft.
	unblockBuffer = 8096

	// pruneInterval is the interval at which we prune objects from the
	// BlockedEvals tracker
	pruneInterval = 5 * time.Minute

	// pruneThreshold is the threshold after which objects will be pruned.
	pruneThreshold = 15 * time.Minute
)

// BlockedEvals is used to track evaluations that shouldn't be queued until a
// certain class of nodes becomes available. An evaluation is put into the
// blocked state when it is run through the scheduler and produced failed
// allocations. It is unblocked when the capacity of a node that could run the
// failed allocation becomes available.
type BlockedEvals struct {
	// logger is the logger to use by the blocked eval tracker.
	logger log.Logger

	// evalBroker is the broker that unblocked evaluations are enqueued into.
	evalBroker *EvalBroker

	// enabled records whether the tracker is currently active; it is toggled
	// through SetEnabled.
	enabled bool

	// stats holds the running counters that Stats copies out.
	stats *BlockedStats

	// l is locked by the tracker's methods around reads and writes of its
	// state.
	l sync.RWMutex

	// captured is the set of evaluations that are captured by computed node
	// classes.
	captured map[string]wrappedEval

	// escaped is the set of evaluations that have escaped computed node
	// classes.
	escaped map[string]wrappedEval

	// capacityChangeCh is used to buffer capacity updates (unblock requests)
	// until the watchCapacity goroutine processes them. It is created with a
	// buffer of unblockBuffer entries.
	capacityChangeCh chan *capacityUpdate

	// jobs is the map of blocked job and is used to ensure that only one
	// blocked eval exists for each job. The value is the blocked evaluation ID.
	jobs map[structs.NamespacedID]string

	// unblockIndexes maps computed node classes or quota name to the index in
	// which they were unblocked. This is used to check if an evaluation could
	// have been unblocked between the time they were in the scheduler and the
	// time they are being blocked.
	unblockIndexes map[string]uint64

	// duplicates is the set of evaluations for jobs that had pre-existing
	// blocked evaluations. These should be marked as cancelled since only one
	// blocked eval is needed per job.
	duplicates []*structs.Evaluation

	// duplicateCh is used to signal that a duplicate eval was added to the
	// duplicate set. It can be used to unblock waiting callers looking for
	// duplicates.
	duplicateCh chan struct{}

	// timetable is used to correlate indexes with their insertion time. This
	// allows us to prune based on time.
	timetable *TimeTable

	// stopCh is used to stop any created goroutines.
	stopCh chan struct{}
}

// capacityUpdate stores unblock data: the computed node class and/or quota
// whose capacity changed, and the index at which the change happened.
type capacityUpdate struct {
	computedClass string
	quotaChange   string
	index         uint64
}

// wrappedEval captures both the evaluation and the optional token
type wrappedEval struct {
	eval  *structs.Evaluation
	token string
}

// BlockedStats holds all the stats about the blocked eval tracker.
type BlockedStats struct {
	// TotalEscaped is the total number of blocked evaluations that have escaped
	// computed node classes.
	TotalEscaped int

	// TotalBlocked is the total number of blocked evaluations.
	TotalBlocked int

	// TotalQuotaLimit is the total number of blocked evaluations that are due
	// to the quota limit being reached.
	TotalQuotaLimit int
}

// NewBlockedEvals creates a new blocked eval tracker that will enqueue
// unblocked evals into the passed broker.
110 func NewBlockedEvals(evalBroker *EvalBroker, logger log.Logger) *BlockedEvals { 111 return &BlockedEvals{ 112 logger: logger.Named("blocked_evals"), 113 evalBroker: evalBroker, 114 captured: make(map[string]wrappedEval), 115 escaped: make(map[string]wrappedEval), 116 jobs: make(map[structs.NamespacedID]string), 117 unblockIndexes: make(map[string]uint64), 118 capacityChangeCh: make(chan *capacityUpdate, unblockBuffer), 119 duplicateCh: make(chan struct{}, 1), 120 stopCh: make(chan struct{}), 121 stats: new(BlockedStats), 122 } 123 } 124 125 // Enabled is used to check if the broker is enabled. 126 func (b *BlockedEvals) Enabled() bool { 127 b.l.RLock() 128 defer b.l.RUnlock() 129 return b.enabled 130 } 131 132 // SetEnabled is used to control if the blocked eval tracker is enabled. The 133 // tracker should only be enabled on the active leader. 134 func (b *BlockedEvals) SetEnabled(enabled bool) { 135 b.l.Lock() 136 if b.enabled == enabled { 137 // No-op 138 b.l.Unlock() 139 return 140 } else if enabled { 141 go b.watchCapacity(b.stopCh, b.capacityChangeCh) 142 go b.prune(b.stopCh) 143 } else { 144 close(b.stopCh) 145 } 146 b.enabled = enabled 147 b.l.Unlock() 148 if !enabled { 149 b.Flush() 150 } 151 } 152 153 func (b *BlockedEvals) SetTimetable(timetable *TimeTable) { 154 b.l.Lock() 155 b.timetable = timetable 156 b.l.Unlock() 157 } 158 159 // Block tracks the passed evaluation and enqueues it into the eval broker when 160 // a suitable node calls unblock. 161 func (b *BlockedEvals) Block(eval *structs.Evaluation) { 162 b.processBlock(eval, "") 163 } 164 165 // Reblock tracks the passed evaluation and enqueues it into the eval broker when 166 // a suitable node calls unblock. Reblock should be used over Block when the 167 // blocking is occurring by an outstanding evaluation. The token is the 168 // evaluation's token. 
169 func (b *BlockedEvals) Reblock(eval *structs.Evaluation, token string) { 170 b.processBlock(eval, token) 171 } 172 173 // processBlock is the implementation of blocking an evaluation. It supports 174 // taking an optional evaluation token to use when reblocking an evaluation that 175 // may be outstanding. 176 func (b *BlockedEvals) processBlock(eval *structs.Evaluation, token string) { 177 b.l.Lock() 178 defer b.l.Unlock() 179 180 // Do nothing if not enabled 181 if !b.enabled { 182 return 183 } 184 185 // Handle the new evaluation being for a job we are already tracking. 186 if b.processBlockJobDuplicate(eval) { 187 // If process block job duplicate returns true, the new evaluation has 188 // been marked as a duplicate and we have nothing to do, so return 189 // early. 190 return 191 } 192 193 // Check if the eval missed an unblock while it was in the scheduler at an 194 // older index. The scheduler could have been invoked with a snapshot of 195 // state that was prior to additional capacity being added or allocations 196 // becoming terminal. 197 if b.missedUnblock(eval) { 198 // Just re-enqueue the eval immediately. We pass the token so that the 199 // eval_broker can properly handle the case in which the evaluation is 200 // still outstanding. 201 b.evalBroker.EnqueueAll(map[*structs.Evaluation]string{eval: token}) 202 return 203 } 204 205 // Mark the job as tracked. 206 b.jobs[structs.NewNamespacedID(eval.JobID, eval.Namespace)] = eval.ID 207 b.stats.TotalBlocked++ 208 209 // Track that the evaluation is being added due to reaching the quota limit 210 if eval.QuotaLimitReached != "" { 211 b.stats.TotalQuotaLimit++ 212 } 213 214 // Wrap the evaluation, capturing its token. 215 wrapped := wrappedEval{ 216 eval: eval, 217 token: token, 218 } 219 220 // If the eval has escaped, meaning computed node classes could not capture 221 // the constraints of the job, we store the eval separately as we have to 222 // unblock it whenever node capacity changes. 
This is because we don't know 223 // what node class is feasible for the jobs constraints. 224 if eval.EscapedComputedClass { 225 b.escaped[eval.ID] = wrapped 226 b.stats.TotalEscaped++ 227 return 228 } 229 230 // Add the eval to the set of blocked evals whose jobs constraints are 231 // captured by computed node class. 232 b.captured[eval.ID] = wrapped 233 } 234 235 // processBlockJobDuplicate handles the case where the new eval is for a job 236 // that we are already tracking. If the eval is a duplicate, we add the older 237 // evaluation by Raft index to the list of duplicates such that it can be 238 // cancelled. We only ever want one blocked evaluation per job, otherwise we 239 // would create unnecessary work for the scheduler as multiple evals for the 240 // same job would be run, all producing the same outcome. It is critical to 241 // prefer the newer evaluation, since it will contain the most up to date set of 242 // class eligibility. The return value is set to true, if the passed evaluation 243 // is cancelled. This should be called with the lock held. 
244 func (b *BlockedEvals) processBlockJobDuplicate(eval *structs.Evaluation) (newCancelled bool) { 245 existingID, hasExisting := b.jobs[structs.NewNamespacedID(eval.JobID, eval.Namespace)] 246 if !hasExisting { 247 return 248 } 249 250 var dup *structs.Evaluation 251 existingW, ok := b.captured[existingID] 252 if ok { 253 if latestEvalIndex(existingW.eval) <= latestEvalIndex(eval) { 254 delete(b.captured, existingID) 255 b.stats.TotalBlocked-- 256 dup = existingW.eval 257 } else { 258 dup = eval 259 newCancelled = true 260 } 261 } else { 262 existingW, ok = b.escaped[existingID] 263 if !ok { 264 // This is a programming error 265 b.logger.Error("existing blocked evaluation is neither tracked as captured or escaped", "existing_id", existingID) 266 delete(b.jobs, structs.NewNamespacedID(eval.JobID, eval.Namespace)) 267 return 268 } 269 270 if latestEvalIndex(existingW.eval) <= latestEvalIndex(eval) { 271 delete(b.escaped, existingID) 272 b.stats.TotalEscaped-- 273 dup = existingW.eval 274 } else { 275 dup = eval 276 newCancelled = true 277 } 278 } 279 280 b.duplicates = append(b.duplicates, dup) 281 282 // Unblock any waiter. 283 select { 284 case b.duplicateCh <- struct{}{}: 285 default: 286 } 287 288 return 289 } 290 291 // latestEvalIndex returns the max of the evaluations create and snapshot index 292 func latestEvalIndex(eval *structs.Evaluation) uint64 { 293 if eval == nil { 294 return 0 295 } 296 297 return helper.Uint64Max(eval.CreateIndex, eval.SnapshotIndex) 298 } 299 300 // missedUnblock returns whether an evaluation missed an unblock while it was in 301 // the scheduler. Since the scheduler can operate at an index in the past, the 302 // evaluation may have been processed missing data that would allow it to 303 // complete. This method returns if that is the case and should be called with 304 // the lock held. 
305 func (b *BlockedEvals) missedUnblock(eval *structs.Evaluation) bool { 306 var max uint64 = 0 307 for id, index := range b.unblockIndexes { 308 // Calculate the max unblock index 309 if max < index { 310 max = index 311 } 312 313 // The evaluation is blocked because it has hit a quota limit not class 314 // eligibility 315 if eval.QuotaLimitReached != "" { 316 if eval.QuotaLimitReached != id { 317 // Not a match 318 continue 319 } else if eval.SnapshotIndex < index { 320 // The evaluation was processed before the quota specification was 321 // updated, so unblock the evaluation. 322 return true 323 } 324 325 // The evaluation was processed having seen all changes to the quota 326 return false 327 } 328 329 elig, ok := eval.ClassEligibility[id] 330 if !ok && eval.SnapshotIndex < index { 331 // The evaluation was processed and did not encounter this class 332 // because it was added after it was processed. Thus for correctness 333 // we need to unblock it. 334 return true 335 } 336 337 // The evaluation could use the computed node class and the eval was 338 // processed before the last unblock. 339 if elig && eval.SnapshotIndex < index { 340 return true 341 } 342 } 343 344 // If the evaluation has escaped, and the map contains an index older than 345 // the evaluations, it should be unblocked. 346 if eval.EscapedComputedClass && eval.SnapshotIndex < max { 347 return true 348 } 349 350 // The evaluation is ahead of all recent unblocks. 351 return false 352 } 353 354 // Untrack causes any blocked evaluation for the passed job to be no longer 355 // tracked. Untrack is called when there is a successful evaluation for the job 356 // and a blocked evaluation is no longer needed. 
357 func (b *BlockedEvals) Untrack(jobID, namespace string) { 358 b.l.Lock() 359 defer b.l.Unlock() 360 361 // Do nothing if not enabled 362 if !b.enabled { 363 return 364 } 365 366 nsID := structs.NewNamespacedID(jobID, namespace) 367 368 // Get the evaluation ID to cancel 369 evalID, ok := b.jobs[nsID] 370 if !ok { 371 // No blocked evaluation so exit 372 return 373 } 374 375 // Attempt to delete the evaluation 376 if w, ok := b.captured[evalID]; ok { 377 delete(b.jobs, nsID) 378 delete(b.captured, evalID) 379 b.stats.TotalBlocked-- 380 if w.eval.QuotaLimitReached != "" { 381 b.stats.TotalQuotaLimit-- 382 } 383 } 384 385 if w, ok := b.escaped[evalID]; ok { 386 delete(b.jobs, nsID) 387 delete(b.escaped, evalID) 388 b.stats.TotalEscaped-- 389 b.stats.TotalBlocked-- 390 if w.eval.QuotaLimitReached != "" { 391 b.stats.TotalQuotaLimit-- 392 } 393 } 394 } 395 396 // Unblock causes any evaluation that could potentially make progress on a 397 // capacity change on the passed computed node class to be enqueued into the 398 // eval broker. 399 func (b *BlockedEvals) Unblock(computedClass string, index uint64) { 400 b.l.Lock() 401 402 // Do nothing if not enabled 403 if !b.enabled { 404 b.l.Unlock() 405 return 406 } 407 408 // Store the index in which the unblock happened. We use this on subsequent 409 // block calls in case the evaluation was in the scheduler when a trigger 410 // occurred. 411 b.unblockIndexes[computedClass] = index 412 b.l.Unlock() 413 414 b.capacityChangeCh <- &capacityUpdate{ 415 computedClass: computedClass, 416 index: index, 417 } 418 } 419 420 // UnblockQuota causes any evaluation that could potentially make progress on a 421 // capacity change on the passed quota to be enqueued into the eval broker. 
422 func (b *BlockedEvals) UnblockQuota(quota string, index uint64) { 423 // Nothing to do 424 if quota == "" { 425 return 426 } 427 428 b.l.Lock() 429 430 // Do nothing if not enabled 431 if !b.enabled { 432 b.l.Unlock() 433 return 434 } 435 436 // Store the index in which the unblock happened. We use this on subsequent 437 // block calls in case the evaluation was in the scheduler when a trigger 438 // occurred. 439 b.unblockIndexes[quota] = index 440 b.l.Unlock() 441 442 b.capacityChangeCh <- &capacityUpdate{ 443 quotaChange: quota, 444 index: index, 445 } 446 } 447 448 // UnblockClassAndQuota causes any evaluation that could potentially make 449 // progress on a capacity change on the passed computed node class or quota to 450 // be enqueued into the eval broker. 451 func (b *BlockedEvals) UnblockClassAndQuota(class, quota string, index uint64) { 452 b.l.Lock() 453 454 // Do nothing if not enabled 455 if !b.enabled { 456 b.l.Unlock() 457 return 458 } 459 460 // Store the index in which the unblock happened. We use this on subsequent 461 // block calls in case the evaluation was in the scheduler when a trigger 462 // occurred. 463 if quota != "" { 464 b.unblockIndexes[quota] = index 465 } 466 b.unblockIndexes[class] = index 467 468 // Capture chan inside the lock to prevent a race with it getting reset 469 // in Flush. 470 ch := b.capacityChangeCh 471 b.l.Unlock() 472 473 ch <- &capacityUpdate{ 474 computedClass: class, 475 quotaChange: quota, 476 index: index, 477 } 478 } 479 480 // watchCapacity is a long lived function that watches for capacity changes in 481 // nodes and unblocks the correct set of evals. 
482 func (b *BlockedEvals) watchCapacity(stopCh <-chan struct{}, changeCh <-chan *capacityUpdate) { 483 for { 484 select { 485 case <-stopCh: 486 return 487 case update := <-changeCh: 488 b.unblock(update.computedClass, update.quotaChange, update.index) 489 } 490 } 491 } 492 493 func (b *BlockedEvals) unblock(computedClass, quota string, index uint64) { 494 b.l.Lock() 495 defer b.l.Unlock() 496 497 // Protect against the case of a flush. 498 if !b.enabled { 499 return 500 } 501 502 // Every eval that has escaped computed node class has to be unblocked 503 // because any node could potentially be feasible. 504 numEscaped := len(b.escaped) 505 numQuotaLimit := 0 506 unblocked := make(map[*structs.Evaluation]string, lib.MaxInt(numEscaped, 4)) 507 508 if numEscaped != 0 && computedClass != "" { 509 for id, wrapped := range b.escaped { 510 unblocked[wrapped.eval] = wrapped.token 511 delete(b.escaped, id) 512 delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace)) 513 514 if wrapped.eval.QuotaLimitReached != "" { 515 numQuotaLimit++ 516 } 517 } 518 } 519 520 // We unblock any eval that is explicitly eligible for the computed class 521 // and also any eval that is not eligible or uneligible. This signifies that 522 // when the evaluation was originally run through the scheduler, that it 523 // never saw a node with the given computed class and thus needs to be 524 // unblocked for correctness. 525 for id, wrapped := range b.captured { 526 if quota != "" && wrapped.eval.QuotaLimitReached != quota { 527 // We are unblocking based on quota and this eval doesn't match 528 continue 529 } else if elig, ok := wrapped.eval.ClassEligibility[computedClass]; ok && !elig { 530 // Can skip because the eval has explicitly marked the node class 531 // as ineligible. 532 continue 533 } 534 535 // Unblock the evaluation because it is either for the matching quota, 536 // is eligible based on the computed node class, or never seen the 537 // computed node class. 
538 unblocked[wrapped.eval] = wrapped.token 539 delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace)) 540 delete(b.captured, id) 541 if wrapped.eval.QuotaLimitReached != "" { 542 numQuotaLimit++ 543 } 544 } 545 546 if l := len(unblocked); l != 0 { 547 // Update the counters 548 b.stats.TotalEscaped = 0 549 b.stats.TotalBlocked -= l 550 b.stats.TotalQuotaLimit -= numQuotaLimit 551 552 // Enqueue all the unblocked evals into the broker. 553 b.evalBroker.EnqueueAll(unblocked) 554 } 555 } 556 557 // UnblockFailed unblocks all blocked evaluation that were due to scheduler 558 // failure. 559 func (b *BlockedEvals) UnblockFailed() { 560 b.l.Lock() 561 defer b.l.Unlock() 562 563 // Do nothing if not enabled 564 if !b.enabled { 565 return 566 } 567 568 quotaLimit := 0 569 unblocked := make(map[*structs.Evaluation]string, 4) 570 for id, wrapped := range b.captured { 571 if wrapped.eval.TriggeredBy == structs.EvalTriggerMaxPlans { 572 unblocked[wrapped.eval] = wrapped.token 573 delete(b.captured, id) 574 delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace)) 575 if wrapped.eval.QuotaLimitReached != "" { 576 quotaLimit++ 577 } 578 } 579 } 580 581 for id, wrapped := range b.escaped { 582 if wrapped.eval.TriggeredBy == structs.EvalTriggerMaxPlans { 583 unblocked[wrapped.eval] = wrapped.token 584 delete(b.escaped, id) 585 delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace)) 586 b.stats.TotalEscaped -= 1 587 if wrapped.eval.QuotaLimitReached != "" { 588 quotaLimit++ 589 } 590 } 591 } 592 593 if l := len(unblocked); l > 0 { 594 b.stats.TotalBlocked -= l 595 b.stats.TotalQuotaLimit -= quotaLimit 596 b.evalBroker.EnqueueAll(unblocked) 597 } 598 } 599 600 // GetDuplicates returns all the duplicate evaluations and blocks until the 601 // passed timeout. 
602 func (b *BlockedEvals) GetDuplicates(timeout time.Duration) []*structs.Evaluation { 603 var timeoutTimer *time.Timer 604 var timeoutCh <-chan time.Time 605 SCAN: 606 b.l.Lock() 607 if len(b.duplicates) != 0 { 608 dups := b.duplicates 609 b.duplicates = nil 610 b.l.Unlock() 611 return dups 612 } 613 614 // Capture chans inside the lock to prevent a race with them getting 615 // reset in Flush 616 dupCh := b.duplicateCh 617 stopCh := b.stopCh 618 b.l.Unlock() 619 620 // Create the timer 621 if timeoutTimer == nil && timeout != 0 { 622 timeoutTimer = time.NewTimer(timeout) 623 timeoutCh = timeoutTimer.C 624 defer timeoutTimer.Stop() 625 } 626 627 select { 628 case <-stopCh: 629 return nil 630 case <-timeoutCh: 631 return nil 632 case <-dupCh: 633 goto SCAN 634 } 635 } 636 637 // Flush is used to clear the state of blocked evaluations. 638 func (b *BlockedEvals) Flush() { 639 b.l.Lock() 640 defer b.l.Unlock() 641 642 // Reset the blocked eval tracker. 643 b.stats.TotalEscaped = 0 644 b.stats.TotalBlocked = 0 645 b.stats.TotalQuotaLimit = 0 646 b.captured = make(map[string]wrappedEval) 647 b.escaped = make(map[string]wrappedEval) 648 b.jobs = make(map[structs.NamespacedID]string) 649 b.unblockIndexes = make(map[string]uint64) 650 b.timetable = nil 651 b.duplicates = nil 652 b.capacityChangeCh = make(chan *capacityUpdate, unblockBuffer) 653 b.stopCh = make(chan struct{}) 654 b.duplicateCh = make(chan struct{}, 1) 655 } 656 657 // Stats is used to query the state of the blocked eval tracker. 
658 func (b *BlockedEvals) Stats() *BlockedStats { 659 // Allocate a new stats struct 660 stats := new(BlockedStats) 661 662 b.l.RLock() 663 defer b.l.RUnlock() 664 665 // Copy all the stats 666 stats.TotalEscaped = b.stats.TotalEscaped 667 stats.TotalBlocked = b.stats.TotalBlocked 668 stats.TotalQuotaLimit = b.stats.TotalQuotaLimit 669 return stats 670 } 671 672 // EmitStats is used to export metrics about the blocked eval tracker while enabled 673 func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) { 674 for { 675 select { 676 case <-time.After(period): 677 stats := b.Stats() 678 metrics.SetGauge([]string{"nomad", "blocked_evals", "total_quota_limit"}, float32(stats.TotalQuotaLimit)) 679 metrics.SetGauge([]string{"nomad", "blocked_evals", "total_blocked"}, float32(stats.TotalBlocked)) 680 metrics.SetGauge([]string{"nomad", "blocked_evals", "total_escaped"}, float32(stats.TotalEscaped)) 681 case <-stopCh: 682 return 683 } 684 } 685 } 686 687 // prune is a long lived function that prunes unnecessary objects on a timer. 688 func (b *BlockedEvals) prune(stopCh <-chan struct{}) { 689 ticker := time.NewTicker(pruneInterval) 690 defer ticker.Stop() 691 692 for { 693 select { 694 case <-stopCh: 695 return 696 case <-ticker.C: 697 b.pruneUnblockIndexes() 698 } 699 } 700 } 701 702 // pruneUnblockIndexes is used to prune any tracked entry that is excessively 703 // old. This protects againsts unbounded growth of the map. 704 func (b *BlockedEvals) pruneUnblockIndexes() { 705 b.l.Lock() 706 defer b.l.Unlock() 707 708 if b.timetable == nil { 709 return 710 } 711 712 cutoff := time.Now().UTC().Add(-1 * pruneThreshold) 713 oldThreshold := b.timetable.NearestIndex(cutoff) 714 715 for key, index := range b.unblockIndexes { 716 if index < oldThreshold { 717 delete(b.unblockIndexes, key) 718 } 719 } 720 }