// Source: github.com/hernad/nomad@v1.6.112/nomad/blocked_evals.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package nomad

import (
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/go-hclog"
	"github.com/hernad/nomad/helper"
	"github.com/hernad/nomad/nomad/structs"
)

const (
	// unblockBuffer is the buffer size for the unblock channel. The buffer
	// should be large to ensure that the FSM doesn't block when calling Unblock
	// as this would apply back-pressure on Raft.
	unblockBuffer = 8096

	// pruneInterval is the interval at which we prune objects from the
	// BlockedEvals tracker
	pruneInterval = 5 * time.Minute

	// pruneThreshold is the threshold after which objects will be pruned.
	pruneThreshold = 15 * time.Minute
)

// BlockedEvals is used to track evaluations that shouldn't be queued until a
// certain class of nodes becomes available. An evaluation is put into the
// blocked state when it is run through the scheduler and produced failed
// allocations. It is unblocked when the capacity of a node that could run the
// failed allocation becomes available.
type BlockedEvals struct {
	// logger is the logger to use by the blocked eval tracker.
	logger hclog.Logger

	// evalBroker is the broker that unblocked evaluations are enqueued into.
	evalBroker *EvalBroker

	// enabled records whether the tracker is active; it is toggled by
	// SetEnabled.
	enabled bool

	// stats holds the counters that Stats copies out for callers.
	stats *BlockedStats

	// l guards the fields of this struct.
	l sync.RWMutex

	// captured is the set of evaluations that are captured by computed node
	// classes.
	captured map[string]wrappedEval

	// escaped is the set of evaluations that have escaped computed node
	// classes.
	escaped map[string]wrappedEval

	// system is the set of system evaluations that failed to start on nodes because of
	// resource constraints.
	system *systemEvals

	// capacityChangeCh is used to buffer unblocking of evaluations. The
	// Unblock* methods send updates into it and watchCapacity drains it.
	capacityChangeCh chan *capacityUpdate

	// jobs is the map of blocked job and is used to ensure that only one
	// blocked eval exists for each job. The value is the blocked evaluation ID.
	jobs map[structs.NamespacedID]string

	// unblockIndexes maps computed node classes or quota name to the index in
	// which they were unblocked. This is used to check if an evaluation could
	// have been unblocked between the time they were in the scheduler and the
	// time they are being blocked.
	unblockIndexes map[string]uint64

	// duplicates is the set of evaluations for jobs that had pre-existing
	// blocked evaluations. These should be marked as cancelled since only one
	// blocked eval is needed per job.
	duplicates []*structs.Evaluation

	// duplicateCh is used to signal that a duplicate eval was added to the
	// duplicate set. It can be used to unblock waiting callers looking for
	// duplicates.
	duplicateCh chan struct{}

	// timetable is used to correlate indexes with their insertion time. This
	// allows us to prune based on time.
	timetable *TimeTable

	// stopCh is used to stop any created goroutines.
	stopCh chan struct{}
}

// capacityUpdate stores unblock data.
type capacityUpdate struct {
	// computedClass is the computed node class whose capacity changed; it is
	// left empty for quota-only updates.
	computedClass string

	// quotaChange is the name of the quota that changed; it is left empty for
	// class-only updates.
	quotaChange string

	// index is the index at which the change happened.
	index uint64
}

// wrappedEval captures both the evaluation and the optional token
type wrappedEval struct {
	eval  *structs.Evaluation
	token string
}

// NewBlockedEvals creates a new blocked eval tracker that will enqueue
// unblocked evals into the passed broker.
102 func NewBlockedEvals(evalBroker *EvalBroker, logger hclog.Logger) *BlockedEvals { 103 return &BlockedEvals{ 104 logger: logger.Named("blocked_evals"), 105 evalBroker: evalBroker, 106 captured: make(map[string]wrappedEval), 107 escaped: make(map[string]wrappedEval), 108 system: newSystemEvals(), 109 jobs: make(map[structs.NamespacedID]string), 110 unblockIndexes: make(map[string]uint64), 111 capacityChangeCh: make(chan *capacityUpdate, unblockBuffer), 112 duplicateCh: make(chan struct{}, 1), 113 stopCh: make(chan struct{}), 114 stats: NewBlockedStats(), 115 } 116 } 117 118 // Enabled is used to check if the broker is enabled. 119 func (b *BlockedEvals) Enabled() bool { 120 b.l.RLock() 121 defer b.l.RUnlock() 122 return b.enabled 123 } 124 125 // SetEnabled is used to control if the blocked eval tracker is enabled. The 126 // tracker should only be enabled on the active leader. 127 func (b *BlockedEvals) SetEnabled(enabled bool) { 128 b.l.Lock() 129 if b.enabled == enabled { 130 // No-op 131 b.l.Unlock() 132 return 133 } else if enabled { 134 go b.watchCapacity(b.stopCh, b.capacityChangeCh) 135 go b.prune(b.stopCh) 136 } else { 137 close(b.stopCh) 138 } 139 b.enabled = enabled 140 b.l.Unlock() 141 if !enabled { 142 b.Flush() 143 } 144 } 145 146 func (b *BlockedEvals) SetTimetable(timetable *TimeTable) { 147 b.l.Lock() 148 b.timetable = timetable 149 b.l.Unlock() 150 } 151 152 // Block tracks the passed evaluation and enqueues it into the eval broker when 153 // a suitable node calls unblock. 154 func (b *BlockedEvals) Block(eval *structs.Evaluation) { 155 b.processBlock(eval, "") 156 } 157 158 // Reblock tracks the passed evaluation and enqueues it into the eval broker when 159 // a suitable node calls unblock. Reblock should be used over Block when the 160 // blocking is occurring by an outstanding evaluation. The token is the 161 // evaluation's token. 
162 func (b *BlockedEvals) Reblock(eval *structs.Evaluation, token string) { 163 b.processBlock(eval, token) 164 } 165 166 // processBlock is the implementation of blocking an evaluation. It supports 167 // taking an optional evaluation token to use when reblocking an evaluation that 168 // may be outstanding. 169 func (b *BlockedEvals) processBlock(eval *structs.Evaluation, token string) { 170 b.l.Lock() 171 defer b.l.Unlock() 172 173 // Do nothing if not enabled 174 if !b.enabled { 175 return 176 } 177 178 // Handle the new evaluation being for a job we are already tracking. 179 if b.processBlockJobDuplicate(eval) { 180 // If process block job duplicate returns true, the new evaluation has 181 // been marked as a duplicate and we have nothing to do, so return 182 // early. 183 return 184 } 185 186 // Check if the eval missed an unblock while it was in the scheduler at an 187 // older index. The scheduler could have been invoked with a snapshot of 188 // state that was prior to additional capacity being added or allocations 189 // becoming terminal. 190 if b.missedUnblock(eval) { 191 // Just re-enqueue the eval immediately. We pass the token so that the 192 // eval_broker can properly handle the case in which the evaluation is 193 // still outstanding. 194 b.evalBroker.EnqueueAll(map[*structs.Evaluation]string{eval: token}) 195 return 196 } 197 198 // Mark the job as tracked. 199 b.jobs[structs.NewNamespacedID(eval.JobID, eval.Namespace)] = eval.ID 200 b.stats.Block(eval) 201 202 // Track that the evaluation is being added due to reaching the quota limit 203 if eval.QuotaLimitReached != "" { 204 b.stats.TotalQuotaLimit++ 205 } 206 207 // Wrap the evaluation, capturing its token. 208 wrapped := wrappedEval{ 209 eval: eval, 210 token: token, 211 } 212 213 // If the eval has escaped, meaning computed node classes could not capture 214 // the constraints of the job, we store the eval separately as we have to 215 // unblock it whenever node capacity changes. 
This is because we don't know 216 // what node class is feasible for the jobs constraints. 217 if eval.EscapedComputedClass { 218 b.escaped[eval.ID] = wrapped 219 b.stats.TotalEscaped++ 220 return 221 } 222 223 // System evals are indexed by node and re-processed on utilization changes in 224 // existing nodes 225 if eval.Type == structs.JobTypeSystem { 226 b.system.Add(eval, token) 227 } 228 229 // Add the eval to the set of blocked evals whose jobs constraints are 230 // captured by computed node class. 231 b.captured[eval.ID] = wrapped 232 } 233 234 // processBlockJobDuplicate handles the case where the new eval is for a job 235 // that we are already tracking. If the eval is a duplicate, we add the older 236 // evaluation by Raft index to the list of duplicates such that it can be 237 // cancelled. We only ever want one blocked evaluation per job, otherwise we 238 // would create unnecessary work for the scheduler as multiple evals for the 239 // same job would be run, all producing the same outcome. It is critical to 240 // prefer the newer evaluation, since it will contain the most up to date set of 241 // class eligibility. The return value is set to true, if the passed evaluation 242 // is cancelled. This should be called with the lock held. 
243 func (b *BlockedEvals) processBlockJobDuplicate(eval *structs.Evaluation) (newCancelled bool) { 244 existingID, hasExisting := b.jobs[structs.NewNamespacedID(eval.JobID, eval.Namespace)] 245 if !hasExisting { 246 return 247 } 248 249 var dup *structs.Evaluation 250 existingW, ok := b.captured[existingID] 251 if ok { 252 if latestEvalIndex(existingW.eval) <= latestEvalIndex(eval) { 253 delete(b.captured, existingID) 254 dup = existingW.eval 255 b.stats.Unblock(dup) 256 } else { 257 dup = eval 258 newCancelled = true 259 } 260 } else { 261 existingW, ok = b.escaped[existingID] 262 if !ok { 263 // This is a programming error 264 b.logger.Error("existing blocked evaluation is neither tracked as captured or escaped", "existing_id", existingID) 265 delete(b.jobs, structs.NewNamespacedID(eval.JobID, eval.Namespace)) 266 return 267 } 268 269 if latestEvalIndex(existingW.eval) <= latestEvalIndex(eval) { 270 delete(b.escaped, existingID) 271 b.stats.TotalEscaped-- 272 dup = existingW.eval 273 } else { 274 dup = eval 275 newCancelled = true 276 } 277 } 278 279 b.duplicates = append(b.duplicates, dup) 280 281 // Unblock any waiter. 282 select { 283 case b.duplicateCh <- struct{}{}: 284 default: 285 } 286 287 return 288 } 289 290 // latestEvalIndex returns the max of the evaluations create and snapshot index 291 func latestEvalIndex(eval *structs.Evaluation) uint64 { 292 if eval == nil { 293 return 0 294 } 295 296 return helper.Max(eval.CreateIndex, eval.SnapshotIndex) 297 } 298 299 // missedUnblock returns whether an evaluation missed an unblock while it was in 300 // the scheduler. Since the scheduler can operate at an index in the past, the 301 // evaluation may have been processed missing data that would allow it to 302 // complete. This method returns if that is the case and should be called with 303 // the lock held. 
304 func (b *BlockedEvals) missedUnblock(eval *structs.Evaluation) bool { 305 var max uint64 = 0 306 for id, index := range b.unblockIndexes { 307 // Calculate the max unblock index 308 if max < index { 309 max = index 310 } 311 312 // The evaluation is blocked because it has hit a quota limit not class 313 // eligibility 314 if eval.QuotaLimitReached != "" { 315 if eval.QuotaLimitReached != id { 316 // Not a match 317 continue 318 } else if eval.SnapshotIndex < index { 319 // The evaluation was processed before the quota specification was 320 // updated, so unblock the evaluation. 321 return true 322 } 323 324 // The evaluation was processed having seen all changes to the quota 325 return false 326 } 327 328 elig, ok := eval.ClassEligibility[id] 329 if !ok && eval.SnapshotIndex < index { 330 // The evaluation was processed and did not encounter this class 331 // because it was added after it was processed. Thus for correctness 332 // we need to unblock it. 333 return true 334 } 335 336 // The evaluation could use the computed node class and the eval was 337 // processed before the last unblock. 338 if elig && eval.SnapshotIndex < index { 339 return true 340 } 341 } 342 343 // If the evaluation has escaped, and the map contains an index older than 344 // the evaluations, it should be unblocked. 345 if eval.EscapedComputedClass && eval.SnapshotIndex < max { 346 return true 347 } 348 349 // The evaluation is ahead of all recent unblocks. 350 return false 351 } 352 353 // Untrack causes any blocked evaluation for the passed job to be no longer 354 // tracked. Untrack is called when there is a successful evaluation for the job 355 // and a blocked evaluation is no longer needed. 
356 func (b *BlockedEvals) Untrack(jobID, namespace string) { 357 b.l.Lock() 358 defer b.l.Unlock() 359 360 // Do nothing if not enabled 361 if !b.enabled { 362 return 363 } 364 365 nsID := structs.NewNamespacedID(jobID, namespace) 366 367 if evals, ok := b.system.JobEvals(nsID); ok { 368 for _, e := range evals { 369 b.system.Remove(e) 370 b.stats.Unblock(e) 371 } 372 return 373 } 374 375 // Get the evaluation ID to cancel 376 evalID, ok := b.jobs[nsID] 377 if !ok { 378 // No blocked evaluation so exit 379 return 380 } 381 382 // Attempt to delete the evaluation 383 if w, ok := b.captured[evalID]; ok { 384 delete(b.jobs, nsID) 385 delete(b.captured, evalID) 386 b.stats.Unblock(w.eval) 387 if w.eval.QuotaLimitReached != "" { 388 b.stats.TotalQuotaLimit-- 389 } 390 } 391 392 if w, ok := b.escaped[evalID]; ok { 393 delete(b.jobs, nsID) 394 delete(b.escaped, evalID) 395 b.stats.TotalEscaped-- 396 b.stats.Unblock(w.eval) 397 if w.eval.QuotaLimitReached != "" { 398 b.stats.TotalQuotaLimit-- 399 } 400 } 401 } 402 403 // Unblock causes any evaluation that could potentially make progress on a 404 // capacity change on the passed computed node class to be enqueued into the 405 // eval broker. 406 func (b *BlockedEvals) Unblock(computedClass string, index uint64) { 407 b.l.Lock() 408 409 // Do nothing if not enabled 410 if !b.enabled { 411 b.l.Unlock() 412 return 413 } 414 415 // Store the index in which the unblock happened. We use this on subsequent 416 // block calls in case the evaluation was in the scheduler when a trigger 417 // occurred. 
418 b.unblockIndexes[computedClass] = index 419 420 // Capture chan in lock as Flush overwrites it 421 ch := b.capacityChangeCh 422 done := b.stopCh 423 b.l.Unlock() 424 425 select { 426 case <-done: 427 case ch <- &capacityUpdate{ 428 computedClass: computedClass, 429 index: index, 430 }: 431 } 432 } 433 434 // UnblockQuota causes any evaluation that could potentially make progress on a 435 // capacity change on the passed quota to be enqueued into the eval broker. 436 func (b *BlockedEvals) UnblockQuota(quota string, index uint64) { 437 // Nothing to do 438 if quota == "" { 439 return 440 } 441 442 b.l.Lock() 443 444 // Do nothing if not enabled 445 if !b.enabled { 446 b.l.Unlock() 447 return 448 } 449 450 // Store the index in which the unblock happened. We use this on subsequent 451 // block calls in case the evaluation was in the scheduler when a trigger 452 // occurred. 453 b.unblockIndexes[quota] = index 454 ch := b.capacityChangeCh 455 done := b.stopCh 456 b.l.Unlock() 457 458 select { 459 case <-done: 460 case ch <- &capacityUpdate{ 461 quotaChange: quota, 462 index: index, 463 }: 464 } 465 } 466 467 // UnblockClassAndQuota causes any evaluation that could potentially make 468 // progress on a capacity change on the passed computed node class or quota to 469 // be enqueued into the eval broker. 470 func (b *BlockedEvals) UnblockClassAndQuota(class, quota string, index uint64) { 471 b.l.Lock() 472 473 // Do nothing if not enabled 474 if !b.enabled { 475 b.l.Unlock() 476 return 477 } 478 479 // Store the index in which the unblock happened. We use this on subsequent 480 // block calls in case the evaluation was in the scheduler when a trigger 481 // occurred. 482 if quota != "" { 483 b.unblockIndexes[quota] = index 484 } 485 b.unblockIndexes[class] = index 486 487 // Capture chan inside the lock to prevent a race with it getting reset 488 // in Flush. 
489 ch := b.capacityChangeCh 490 done := b.stopCh 491 b.l.Unlock() 492 493 select { 494 case <-done: 495 case ch <- &capacityUpdate{ 496 computedClass: class, 497 quotaChange: quota, 498 index: index, 499 }: 500 } 501 } 502 503 // UnblockNode finds any blocked evalution that's node specific (system jobs) and enqueues 504 // it on the eval broker 505 func (b *BlockedEvals) UnblockNode(nodeID string, index uint64) { 506 b.l.Lock() 507 defer b.l.Unlock() 508 509 evals, ok := b.system.NodeEvals(nodeID) 510 511 // Do nothing if not enabled 512 if !b.enabled || !ok || len(evals) == 0 { 513 return 514 } 515 516 for e := range evals { 517 b.system.Remove(e) 518 b.stats.Unblock(e) 519 } 520 521 b.evalBroker.EnqueueAll(evals) 522 } 523 524 // watchCapacity is a long lived function that watches for capacity changes in 525 // nodes and unblocks the correct set of evals. 526 func (b *BlockedEvals) watchCapacity(stopCh <-chan struct{}, changeCh <-chan *capacityUpdate) { 527 for { 528 select { 529 case <-stopCh: 530 return 531 case update := <-changeCh: 532 b.unblock(update.computedClass, update.quotaChange, update.index) 533 } 534 } 535 } 536 537 func (b *BlockedEvals) unblock(computedClass, quota string, index uint64) { 538 b.l.Lock() 539 defer b.l.Unlock() 540 541 // Protect against the case of a flush. 542 if !b.enabled { 543 return 544 } 545 546 // Every eval that has escaped computed node class has to be unblocked 547 // because any node could potentially be feasible. 
548 numEscaped := len(b.escaped) 549 numQuotaLimit := 0 550 unblocked := make(map[*structs.Evaluation]string, helper.Max(numEscaped, 4)) 551 552 if numEscaped != 0 && computedClass != "" { 553 for id, wrapped := range b.escaped { 554 unblocked[wrapped.eval] = wrapped.token 555 delete(b.escaped, id) 556 delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace)) 557 558 if wrapped.eval.QuotaLimitReached != "" { 559 numQuotaLimit++ 560 } 561 } 562 } 563 564 // We unblock any eval that is explicitly eligible for the computed class 565 // and also any eval that is not eligible or uneligible. This signifies that 566 // when the evaluation was originally run through the scheduler, that it 567 // never saw a node with the given computed class and thus needs to be 568 // unblocked for correctness. 569 for id, wrapped := range b.captured { 570 if quota != "" && wrapped.eval.QuotaLimitReached != quota { 571 // We are unblocking based on quota and this eval doesn't match 572 continue 573 } else if elig, ok := wrapped.eval.ClassEligibility[computedClass]; ok && !elig { 574 // Can skip because the eval has explicitly marked the node class 575 // as ineligible. 576 continue 577 } 578 579 // Unblock the evaluation because it is either for the matching quota, 580 // is eligible based on the computed node class, or never seen the 581 // computed node class. 582 unblocked[wrapped.eval] = wrapped.token 583 delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace)) 584 delete(b.captured, id) 585 if wrapped.eval.QuotaLimitReached != "" { 586 numQuotaLimit++ 587 } 588 } 589 590 if len(unblocked) != 0 { 591 // Update the counters 592 b.stats.TotalEscaped = 0 593 b.stats.TotalQuotaLimit -= numQuotaLimit 594 for eval := range unblocked { 595 b.stats.Unblock(eval) 596 } 597 598 // Enqueue all the unblocked evals into the broker. 
599 b.evalBroker.EnqueueAll(unblocked) 600 } 601 } 602 603 // UnblockFailed unblocks all blocked evaluation that were due to scheduler 604 // failure. 605 func (b *BlockedEvals) UnblockFailed() { 606 b.l.Lock() 607 defer b.l.Unlock() 608 609 // Do nothing if not enabled 610 if !b.enabled { 611 return 612 } 613 614 quotaLimit := 0 615 unblocked := make(map[*structs.Evaluation]string, 4) 616 for id, wrapped := range b.captured { 617 if wrapped.eval.TriggeredBy == structs.EvalTriggerMaxPlans { 618 unblocked[wrapped.eval] = wrapped.token 619 delete(b.captured, id) 620 delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace)) 621 if wrapped.eval.QuotaLimitReached != "" { 622 quotaLimit++ 623 } 624 } 625 } 626 627 for id, wrapped := range b.escaped { 628 if wrapped.eval.TriggeredBy == structs.EvalTriggerMaxPlans { 629 unblocked[wrapped.eval] = wrapped.token 630 delete(b.escaped, id) 631 delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace)) 632 b.stats.TotalEscaped -= 1 633 if wrapped.eval.QuotaLimitReached != "" { 634 quotaLimit++ 635 } 636 } 637 } 638 639 if len(unblocked) > 0 { 640 b.stats.TotalQuotaLimit -= quotaLimit 641 for eval := range unblocked { 642 b.stats.Unblock(eval) 643 } 644 645 b.evalBroker.EnqueueAll(unblocked) 646 } 647 } 648 649 // GetDuplicates returns all the duplicate evaluations and blocks until the 650 // passed timeout. 
651 func (b *BlockedEvals) GetDuplicates(timeout time.Duration) []*structs.Evaluation { 652 var timeoutTimer *time.Timer 653 var timeoutCh <-chan time.Time 654 SCAN: 655 b.l.Lock() 656 if len(b.duplicates) != 0 { 657 dups := b.duplicates 658 b.duplicates = nil 659 b.l.Unlock() 660 return dups 661 } 662 663 // Capture chans inside the lock to prevent a race with them getting 664 // reset in Flush 665 dupCh := b.duplicateCh 666 stopCh := b.stopCh 667 b.l.Unlock() 668 669 // Create the timer 670 if timeoutTimer == nil && timeout != 0 { 671 timeoutTimer = time.NewTimer(timeout) 672 timeoutCh = timeoutTimer.C 673 defer timeoutTimer.Stop() 674 } 675 676 select { 677 case <-stopCh: 678 return nil 679 case <-timeoutCh: 680 return nil 681 case <-dupCh: 682 goto SCAN 683 } 684 } 685 686 // Flush is used to clear the state of blocked evaluations. 687 func (b *BlockedEvals) Flush() { 688 b.l.Lock() 689 defer b.l.Unlock() 690 691 // Reset the blocked eval tracker. 692 b.stats.TotalEscaped = 0 693 b.stats.TotalBlocked = 0 694 b.stats.TotalQuotaLimit = 0 695 b.stats.BlockedResources = NewBlockedResourcesStats() 696 b.captured = make(map[string]wrappedEval) 697 b.escaped = make(map[string]wrappedEval) 698 b.jobs = make(map[structs.NamespacedID]string) 699 b.unblockIndexes = make(map[string]uint64) 700 b.timetable = nil 701 b.duplicates = nil 702 b.capacityChangeCh = make(chan *capacityUpdate, unblockBuffer) 703 b.stopCh = make(chan struct{}) 704 b.duplicateCh = make(chan struct{}, 1) 705 b.system = newSystemEvals() 706 } 707 708 // Stats is used to query the state of the blocked eval tracker. 
709 func (b *BlockedEvals) Stats() *BlockedStats { 710 // Allocate a new stats struct 711 stats := NewBlockedStats() 712 713 b.l.RLock() 714 defer b.l.RUnlock() 715 716 // Copy all the stats 717 stats.TotalEscaped = b.stats.TotalEscaped 718 stats.TotalBlocked = b.stats.TotalBlocked 719 stats.TotalQuotaLimit = b.stats.TotalQuotaLimit 720 stats.BlockedResources = b.stats.BlockedResources.Copy() 721 722 return stats 723 } 724 725 // EmitStats is used to export metrics about the blocked eval tracker while enabled 726 func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) { 727 timer, stop := helper.NewSafeTimer(period) 728 defer stop() 729 730 for { 731 timer.Reset(period) 732 733 select { 734 case <-timer.C: 735 stats := b.Stats() 736 metrics.SetGauge([]string{"nomad", "blocked_evals", "total_quota_limit"}, float32(stats.TotalQuotaLimit)) 737 metrics.SetGauge([]string{"nomad", "blocked_evals", "total_blocked"}, float32(stats.TotalBlocked)) 738 metrics.SetGauge([]string{"nomad", "blocked_evals", "total_escaped"}, float32(stats.TotalEscaped)) 739 740 for k, v := range stats.BlockedResources.ByJob { 741 labels := []metrics.Label{ 742 {Name: "namespace", Value: k.Namespace}, 743 {Name: "job", Value: k.ID}, 744 } 745 metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "cpu"}, float32(v.CPU), labels) 746 metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "memory"}, float32(v.MemoryMB), labels) 747 } 748 749 for k, v := range stats.BlockedResources.ByClassInDC { 750 labels := []metrics.Label{ 751 {Name: "datacenter", Value: k.dc}, 752 {Name: "node_class", Value: k.class}, 753 } 754 metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "cpu"}, float32(v.CPU), labels) 755 metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "memory"}, float32(v.MemoryMB), labels) 756 } 757 case <-stopCh: 758 return 759 } 760 } 761 } 762 763 // prune is a long lived function that prunes unnecessary objects on a timer. 
764 func (b *BlockedEvals) prune(stopCh <-chan struct{}) { 765 ticker := time.NewTicker(pruneInterval) 766 defer ticker.Stop() 767 768 for { 769 select { 770 case <-stopCh: 771 return 772 case t := <-ticker.C: 773 cutoff := t.UTC().Add(-1 * pruneThreshold) 774 b.pruneUnblockIndexes(cutoff) 775 b.pruneStats(cutoff) 776 } 777 } 778 } 779 780 // pruneUnblockIndexes is used to prune any tracked entry that is excessively 781 // old. This protects againsts unbounded growth of the map. 782 func (b *BlockedEvals) pruneUnblockIndexes(cutoff time.Time) { 783 b.l.Lock() 784 defer b.l.Unlock() 785 786 if b.timetable == nil { 787 return 788 } 789 790 oldThreshold := b.timetable.NearestIndex(cutoff) 791 for key, index := range b.unblockIndexes { 792 if index < oldThreshold { 793 delete(b.unblockIndexes, key) 794 } 795 } 796 } 797 798 // pruneStats is used to prune any zero value stats that are excessively old. 799 func (b *BlockedEvals) pruneStats(cutoff time.Time) { 800 b.l.Lock() 801 defer b.l.Unlock() 802 803 b.stats.prune(cutoff) 804 }